diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py index 9132c74091410..29f258bf1b29e 100644 --- a/pandas/core/arrays/__init__.py +++ b/pandas/core/arrays/__init__.py @@ -7,4 +7,4 @@ from .period import PeriodArrayMixin # noqa from .timedeltas import TimedeltaArrayMixin # noqa from .integer import ( # noqa - IntegerArray, to_integer_array) + IntegerArray, integer_array) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index c126117060c3d..48a5db7793921 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -11,6 +11,7 @@ from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass from pandas.core.dtypes.common import ( is_integer, is_scalar, is_float, + is_bool_dtype, is_float_dtype, is_integer_dtype, is_object_dtype, @@ -76,7 +77,7 @@ def construct_from_string(cls, string): "'{}'".format(cls, string)) -def to_integer_array(values, dtype=None): +def integer_array(values, dtype=None, copy=False): """ Infer and return an integer array of the values. 
@@ -85,6 +86,7 @@ def to_integer_array(values, dtype=None): values : 1D list-like dtype : dtype, optional dtype to coerce + copy : boolean, default False Returns ------- @@ -94,7 +96,8 @@ def to_integer_array(values, dtype=None): ------ TypeError if incompatible types """ - return IntegerArray(values, dtype=dtype, copy=False) + values, mask = coerce_to_array(values, dtype=dtype, copy=copy) + return IntegerArray(values, mask) def safe_cast(values, dtype, copy): @@ -133,6 +136,11 @@ def coerce_to_array(values, dtype, mask=None, copy=False): ------- tuple of (values, mask) """ + # if values is integer numpy array, preserve its dtype + if dtype is None and hasattr(values, 'dtype'): + if is_integer_dtype(values.dtype): + dtype = values.dtype + if dtype is not None: if not issubclass(type(dtype), _IntegerDtype): try: @@ -174,10 +182,7 @@ def coerce_to_array(values, dtype, mask=None, copy=False): # infer dtype if needed if dtype is None: - if is_integer_dtype(values): - dtype = values.dtype - else: - dtype = np.dtype('int64') + dtype = np.dtype('int64') else: dtype = dtype.type @@ -197,47 +202,62 @@ def coerce_to_array(values, dtype, mask=None, copy=False): class IntegerArray(ExtensionArray, ExtensionOpsMixin): """ - We represent an IntegerArray with 2 numpy arrays + Array of integer (optional missing) values. + + We represent an IntegerArray with 2 numpy arrays: + - data: contains a numpy integer array of the appropriate dtype - - mask: a boolean array holding a mask on the data, False is missing + - mask: a boolean array holding a mask on the data, True is missing + + To construct an IntegerArray from generic array-like input, use + ``integer_array`` function instead. 
+ + Parameters + ---------- + values : integer 1D numpy array + mask : boolean 1D numpy array + copy : bool, default False + + Returns + ------- + IntegerArray + """ @cache_readonly def dtype(self): return _dtypes[str(self._data.dtype)] - def __init__(self, values, mask=None, dtype=None, copy=False): - """ - Parameters - ---------- - values : 1D list-like / IntegerArray - mask : 1D list-like, optional - dtype : subclass of _IntegerDtype, optional - copy : bool, default False + def __init__(self, values, mask, copy=False): + if not (isinstance(values, np.ndarray) + and is_integer_dtype(values.dtype)): + raise TypeError("values should be integer numpy array. Use " + "the 'integer_array' function instead") + if not (isinstance(mask, np.ndarray) and is_bool_dtype(mask.dtype)): + raise TypeError("mask should be boolean numpy array. Use " + "the 'integer_array' function instead") - Returns - ------- - IntegerArray - """ - self._data, self._mask = coerce_to_array( - values, dtype=dtype, mask=mask, copy=copy) + if copy: + values = values.copy() + mask = mask.copy() + + self._data = values + self._mask = mask @classmethod def _from_sequence(cls, scalars, dtype=None, copy=False): - return cls(scalars, dtype=dtype, copy=copy) + return integer_array(scalars, dtype=dtype, copy=copy) @classmethod def _from_factorized(cls, values, original): - return cls(values, dtype=original.dtype) + return integer_array(values, dtype=original.dtype) def __getitem__(self, item): if is_integer(item): if self._mask[item]: return self.dtype.na_value return self._data[item] - return type(self)(self._data[item], - mask=self._mask[item], - dtype=self.dtype) + return type(self)(self._data[item], self._mask[item]) def _coerce_to_ndarray(self): """ @@ -294,7 +314,7 @@ def take(self, indexer, allow_fill=False, fill_value=None): result[fill_mask] = fill_value mask = mask ^ fill_mask - return type(self)(result, mask=mask, dtype=self.dtype, copy=False) + return type(self)(result, mask, copy=False) def 
copy(self, deep=False): data, mask = self._data, self._mask @@ -304,7 +324,7 @@ def copy(self, deep=False): else: data = data.copy() mask = mask.copy() - return type(self)(data, mask, dtype=self.dtype, copy=False) + return type(self)(data, mask, copy=False) def __setitem__(self, key, value): _is_scalar = is_scalar(value) @@ -356,7 +376,7 @@ def _na_value(self): def _concat_same_type(cls, to_concat): data = np.concatenate([x._data for x in to_concat]) mask = np.concatenate([x._mask for x in to_concat]) - return cls(data, mask=mask, dtype=to_concat[0].dtype) + return cls(data, mask) def astype(self, dtype, copy=True): """Cast to a NumPy array or IntegerArray with 'dtype'. @@ -386,8 +406,7 @@ def astype(self, dtype, copy=True): if isinstance(dtype, _IntegerDtype): result = self._data.astype(dtype.numpy_dtype, casting='same_kind', copy=False) - return type(self)(result, mask=self._mask, - dtype=dtype, copy=False) + return type(self)(result, mask=self._mask, copy=False) # coerce data = self._coerce_to_ndarray() @@ -523,7 +542,7 @@ def _maybe_mask_result(self, result, mask, other, op_name): result[mask] = np.nan return result - return type(self)(result, mask=mask, dtype=self.dtype, copy=False) + return type(self)(result, mask, copy=False) @classmethod def _create_arithmetic_method(cls, op): diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index bfa669a0ca164..5a026fe19a952 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -300,7 +300,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, if not (dtype is None or is_object_dtype(dtype)): # coerce to the provided dtype - data = dtype.construct_array_type()( + data = dtype.construct_array_type()._from_sequence( data, dtype=dtype, copy=False) # coerce to the object dtype diff --git a/pandas/core/series.py b/pandas/core/series.py index 4b4fccccda4a0..090a63dd0cb9f 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4111,7 +4111,7 @@ def 
_try_cast(arr, take_fast_path): ordered=dtype.ordered) elif is_extension_array_dtype(dtype): # create an extension array from its dtype - array_type = dtype.construct_array_type() + array_type = dtype.construct_array_type()._from_sequence subarr = array_type(subarr, dtype=dtype, copy=copy) elif dtype is not None and raise_cast_failure: diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index 886a0f66b5f66..22b21102fa4ae 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -213,7 +213,7 @@ def test_take_series(self, data): s = pd.Series(data) result = s.take([0, -1]) expected = pd.Series( - data._from_sequence([data[0], data[len(data) - 1]]), + data._from_sequence([data[0], data[len(data) - 1]], dtype=s.dtype), index=[0, len(data) - 1]) self.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/base/missing.py b/pandas/tests/extension/base/missing.py index 43b2702c72193..2fe547e50a34b 100644 --- a/pandas/tests/extension/base/missing.py +++ b/pandas/tests/extension/base/missing.py @@ -77,8 +77,8 @@ def test_fillna_series(self, data_missing): ser = pd.Series(data_missing) result = ser.fillna(fill_value) - expected = pd.Series( - data_missing._from_sequence([fill_value, fill_value])) + expected = pd.Series(data_missing._from_sequence( + [fill_value, fill_value], dtype=data_missing.dtype)) self.assert_series_equal(result, expected) # Fill with a series @@ -94,11 +94,11 @@ def test_fillna_series_method(self, data_missing, method): fill_value = data_missing[1] if method == 'ffill': - data_missing = type(data_missing)(data_missing[::-1]) + data_missing = data_missing[::-1] result = pd.Series(data_missing).fillna(method=method) - expected = pd.Series( - data_missing._from_sequence([fill_value, fill_value])) + expected = pd.Series(data_missing._from_sequence( + [fill_value, fill_value], dtype=data_missing.dtype)) self.assert_series_equal(result, expected) @@ -111,7 
+111,8 @@ def test_fillna_frame(self, data_missing): }).fillna(fill_value) expected = pd.DataFrame({ - "A": data_missing._from_sequence([fill_value, fill_value]), + "A": data_missing._from_sequence([fill_value, fill_value], + dtype=data_missing.dtype), "B": [1, 2], }) diff --git a/pandas/tests/extension/integer/test_integer.py b/pandas/tests/extension/integer/test_integer.py index 5e0f5bf0a5dcf..e3bba3f275b73 100644 --- a/pandas/tests/extension/integer/test_integer.py +++ b/pandas/tests/extension/integer/test_integer.py @@ -9,7 +9,7 @@ from pandas.core.dtypes.generic import ABCIndexClass from pandas.core.arrays import ( - to_integer_array, IntegerArray) + integer_array, IntegerArray) from pandas.core.arrays.integer import ( Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype, UInt8Dtype, UInt16Dtype, UInt32Dtype, UInt64Dtype) @@ -31,12 +31,12 @@ def dtype(request): @pytest.fixture def data(dtype): - return IntegerArray(make_data(), dtype=dtype) + return integer_array(make_data(), dtype=dtype) @pytest.fixture def data_missing(dtype): - return IntegerArray([np.nan, 1], dtype=dtype) + return integer_array([np.nan, 1], dtype=dtype) @pytest.fixture @@ -49,12 +49,12 @@ def gen(count): @pytest.fixture def data_for_sorting(dtype): - return IntegerArray([1, 2, 0], dtype=dtype) + return integer_array([1, 2, 0], dtype=dtype) @pytest.fixture def data_missing_for_sorting(dtype): - return IntegerArray([1, np.nan, 0], dtype=dtype) + return integer_array([1, np.nan, 0], dtype=dtype) @pytest.fixture @@ -74,7 +74,7 @@ def data_for_grouping(dtype): a = 0 c = 2 na = np.nan - return IntegerArray([b, b, na, na, a, a, b, c], dtype=dtype) + return integer_array([b, b, na, na, a, a, b, c], dtype=dtype) def test_dtypes(dtype): @@ -494,8 +494,7 @@ def test_construct_index(self, all_data, dropna): else: other = all_data - result = pd.Index(IntegerArray(other, - dtype=all_data.dtype)) + result = pd.Index(integer_array(other, dtype=all_data.dtype)) expected = pd.Index(other, dtype=object) 
self.assert_index_equal(result, expected) @@ -584,14 +583,14 @@ def test_construct_cast_invalid(self, dtype): msg = "cannot safely" arr = [1.2, 2.3, 3.7] with tm.assert_raises_regex(TypeError, msg): - IntegerArray(arr, dtype=dtype) + integer_array(arr, dtype=dtype) with tm.assert_raises_regex(TypeError, msg): pd.Series(arr).astype(dtype) arr = [1.2, 2.3, 3.7, np.nan] with tm.assert_raises_regex(TypeError, msg): - IntegerArray(arr, dtype=dtype) + integer_array(arr, dtype=dtype) with tm.assert_raises_regex(TypeError, msg): pd.Series(arr).astype(dtype) @@ -650,10 +649,45 @@ def test_conversions(data_missing): assert type(r) == type(e) +def test_integer_array_constructor(): + values = np.array([1, 2, 3, 4], dtype='int64') + mask = np.array([False, False, False, True], dtype='bool') + + result = IntegerArray(values, mask) + expected = integer_array([1, 2, 3, np.nan], dtype='int64') + tm.assert_extension_array_equal(result, expected) + + with pytest.raises(TypeError): + IntegerArray(values.tolist(), mask) + + with pytest.raises(TypeError): + IntegerArray(values, mask.tolist()) + + with pytest.raises(TypeError): + IntegerArray(values.astype(float), mask) + + with pytest.raises(TypeError): + IntegerArray(values) + + +def test_integer_array_constructor_copy(): + values = np.array([1, 2, 3, 4], dtype='int64') + mask = np.array([False, False, False, True], dtype='bool') + + result = IntegerArray(values, mask) + assert result._data is values + assert result._mask is mask + + result = IntegerArray(values, mask, copy=True) + assert result._data is not values + assert result._mask is not mask + + @pytest.mark.parametrize( 'values', [ ['foo', 'bar'], + ['1', '2'], 'foo', 1, 1.0, @@ -662,7 +696,41 @@ def test_conversions(data_missing): def test_to_integer_array_error(values): # error in converting existing arrays to IntegerArrays with pytest.raises(TypeError): - to_integer_array(values) + integer_array(values) + + +def test_to_integer_array_inferred_dtype(): + # if values has dtype 
-> respect it + result = integer_array(np.array([1, 2], dtype='int8')) + assert result.dtype == Int8Dtype() + result = integer_array(np.array([1, 2], dtype='int32')) + assert result.dtype == Int32Dtype() + + # if values have no dtype -> always int64 + result = integer_array([1, 2]) + assert result.dtype == Int64Dtype() + + +def test_to_integer_array_dtype_keyword(): + result = integer_array([1, 2], dtype='int8') + assert result.dtype == Int8Dtype() + + # if values has dtype -> override it + result = integer_array(np.array([1, 2], dtype='int8'), dtype='int32') + assert result.dtype == Int32Dtype() + + +def test_to_integer_array_float(): + result = integer_array([1., 2.]) + expected = integer_array([1, 2]) + tm.assert_extension_array_equal(result, expected) + + with pytest.raises(TypeError, match="cannot safely cast non-equivalent"): + integer_array([1.5, 2.]) + + # for float dtypes, the itemsize is not preserved + result = integer_array(np.array([1., 2.], dtype='float32')) + assert result.dtype == Int64Dtype() @pytest.mark.parametrize( @@ -673,8 +741,9 @@ def test_to_integer_array_error(values): (np.array([1, np.nan]), 'int8', Int8Dtype)]) def test_to_integer_array(values, to_dtype, result_dtype): # convert existing arrays to IntegerArrays - result = to_integer_array(values, dtype=to_dtype) - expected = IntegerArray(values, dtype=result_dtype()) + result = integer_array(values, dtype=to_dtype) + assert result.dtype == result_dtype() + expected = integer_array(values, dtype=result_dtype()) tm.assert_extension_array_equal(result, expected)