Skip to content

Split fastpath IntegerArray constructor and general purpose constructor #22070

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
2 changes: 1 addition & 1 deletion pandas/core/arrays/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,4 @@
from .period import PeriodArrayMixin # noqa
from .timedeltas import TimedeltaArrayMixin # noqa
from .integer import ( # noqa
IntegerArray, to_integer_array)
IntegerArray, integer_array)
85 changes: 52 additions & 33 deletions pandas/core/arrays/integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass
from pandas.core.dtypes.common import (
is_integer, is_scalar, is_float,
is_bool_dtype,
is_float_dtype,
is_integer_dtype,
is_object_dtype,
Expand Down Expand Up @@ -76,7 +77,7 @@ def construct_from_string(cls, string):
"'{}'".format(cls, string))


def to_integer_array(values, dtype=None):
def integer_array(values, dtype=None, copy=False):
"""
Infer and return an integer array of the values.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you update the docstring?


Expand All @@ -85,6 +86,7 @@ def to_integer_array(values, dtype=None):
values : 1D list-like
dtype : dtype, optional
dtype to coerce
copy : boolean, default False

Returns
-------
Expand All @@ -94,7 +96,8 @@ def to_integer_array(values, dtype=None):
------
TypeError if incompatible types
"""
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is there still a ref to to_integer_array? (I see it in the diff, but I also see that you changed it above)

return IntegerArray(values, dtype=dtype, copy=False)
values, mask = coerce_to_array(values, dtype=dtype, copy=copy)
return IntegerArray(values, mask)


def safe_cast(values, dtype, copy):
Expand Down Expand Up @@ -133,6 +136,11 @@ def coerce_to_array(values, dtype, mask=None, copy=False):
-------
tuple of (values, mask)
"""
# if values is an integer numpy array, preserve its dtype
if dtype is None and hasattr(values, 'dtype'):
if is_integer_dtype(values.dtype):
dtype = values.dtype

if dtype is not None:
if not issubclass(type(dtype), _IntegerDtype):
try:
Expand Down Expand Up @@ -174,10 +182,7 @@ def coerce_to_array(values, dtype, mask=None, copy=False):

# infer dtype if needed
if dtype is None:
if is_integer_dtype(values):
dtype = values.dtype
else:
dtype = np.dtype('int64')
dtype = np.dtype('int64')
else:
dtype = dtype.type

Expand All @@ -197,47 +202,62 @@ def coerce_to_array(values, dtype, mask=None, copy=False):

class IntegerArray(ExtensionArray, ExtensionOpsMixin):
"""
We represent an IntegerArray with 2 numpy arrays
Array of integer (optional missing) values.

We represent an IntegerArray with 2 numpy arrays:

- data: contains a numpy integer array of the appropriate dtype
- mask: a boolean array holding a mask on the data, False is missing
- mask: a boolean array holding a mask on the data, True is missing

To construct an IntegerArray from generic array-like input, use
``integer_array`` function instead.

Parameters
----------
values : integer 1D numpy array
mask : boolean 1D numpy array
copy : bool, default False

Returns
-------
IntegerArray

"""

@cache_readonly
def dtype(self):
return _dtypes[str(self._data.dtype)]

def __init__(self, values, mask=None, dtype=None, copy=False):
"""
Parameters
----------
values : 1D list-like / IntegerArray
mask : 1D list-like, optional
dtype : subclass of _IntegerDtype, optional
copy : bool, default False
def __init__(self, values, mask, copy=False):
if not (isinstance(values, np.ndarray)
and is_integer_dtype(values.dtype)):
raise TypeError("values should be integer numpy array. Use "
"the 'integer_array' function instead")
if not (isinstance(mask, np.ndarray) and is_bool_dtype(mask.dtype)):
raise TypeError("mask should be boolean numpy array. Use "
"the 'integer_array' function instead")

Returns
-------
IntegerArray
"""
self._data, self._mask = coerce_to_array(
values, dtype=dtype, mask=mask, copy=copy)
if copy:
values = values.copy()
mask = mask.copy()

self._data = values
self._mask = mask

@classmethod
def _from_sequence(cls, scalars, dtype=None, copy=False):
return cls(scalars, dtype=dtype, copy=copy)
return integer_array(scalars, dtype=dtype, copy=copy)

@classmethod
def _from_factorized(cls, values, original):
return cls(values, dtype=original.dtype)
return integer_array(values, dtype=original.dtype)

def __getitem__(self, item):
if is_integer(item):
if self._mask[item]:
return self.dtype.na_value
return self._data[item]
return type(self)(self._data[item],
mask=self._mask[item],
dtype=self.dtype)
return type(self)(self._data[item], self._mask[item])
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jreback additional question: what do you think of writing IntegerArray(...) instead of type(self)(...)?

Python perfectly allows that (and is the same here, as we don't subclass this one further), and I personally find that easier to read.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like using type() as I expect this to be subclassed for BooleanArray. It may turn out that this is not needed for that, but I want to keep that possibility open.


def _coerce_to_ndarray(self):
"""
Expand Down Expand Up @@ -294,7 +314,7 @@ def take(self, indexer, allow_fill=False, fill_value=None):
result[fill_mask] = fill_value
mask = mask ^ fill_mask

return type(self)(result, mask=mask, dtype=self.dtype, copy=False)
return type(self)(result, mask, copy=False)

def copy(self, deep=False):
data, mask = self._data, self._mask
Expand All @@ -304,7 +324,7 @@ def copy(self, deep=False):
else:
data = data.copy()
mask = mask.copy()
return type(self)(data, mask, dtype=self.dtype, copy=False)
return type(self)(data, mask, copy=False)

def __setitem__(self, key, value):
_is_scalar = is_scalar(value)
Expand Down Expand Up @@ -356,7 +376,7 @@ def _na_value(self):
def _concat_same_type(cls, to_concat):
data = np.concatenate([x._data for x in to_concat])
mask = np.concatenate([x._mask for x in to_concat])
return cls(data, mask=mask, dtype=to_concat[0].dtype)
return cls(data, mask)

def astype(self, dtype, copy=True):
"""Cast to a NumPy array or IntegerArray with 'dtype'.
Expand Down Expand Up @@ -386,8 +406,7 @@ def astype(self, dtype, copy=True):
if isinstance(dtype, _IntegerDtype):
result = self._data.astype(dtype.numpy_dtype,
casting='same_kind', copy=False)
return type(self)(result, mask=self._mask,
dtype=dtype, copy=False)
return type(self)(result, mask=self._mask, copy=False)

# coerce
data = self._coerce_to_ndarray()
Expand Down Expand Up @@ -523,7 +542,7 @@ def _maybe_mask_result(self, result, mask, other, op_name):
result[mask] = np.nan
return result

return type(self)(result, mask=mask, dtype=self.dtype, copy=False)
return type(self)(result, mask, copy=False)

@classmethod
def _create_arithmetic_method(cls, op):
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,7 +300,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
if not (dtype is None or is_object_dtype(dtype)):

# coerce to the provided dtype
data = dtype.construct_array_type()(
data = dtype.construct_array_type()._from_sequence(
data, dtype=dtype, copy=False)

# coerce to the object dtype
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -4111,7 +4111,7 @@ def _try_cast(arr, take_fast_path):
ordered=dtype.ordered)
elif is_extension_array_dtype(dtype):
# create an extension array from its dtype
array_type = dtype.construct_array_type()
array_type = dtype.construct_array_type()._from_sequence
subarr = array_type(subarr, dtype=dtype, copy=copy)

elif dtype is not None and raise_cast_failure:
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/extension/base/getitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ def test_take_series(self, data):
s = pd.Series(data)
result = s.take([0, -1])
expected = pd.Series(
data._from_sequence([data[0], data[len(data) - 1]]),
data._from_sequence([data[0], data[len(data) - 1]], dtype=s.dtype),
index=[0, len(data) - 1])
self.assert_series_equal(result, expected)

Expand Down
13 changes: 7 additions & 6 deletions pandas/tests/extension/base/missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,8 @@ def test_fillna_series(self, data_missing):
ser = pd.Series(data_missing)

result = ser.fillna(fill_value)
expected = pd.Series(
data_missing._from_sequence([fill_value, fill_value]))
expected = pd.Series(data_missing._from_sequence(
[fill_value, fill_value], dtype=data_missing.dtype))
self.assert_series_equal(result, expected)

# Fill with a series
Expand All @@ -94,11 +94,11 @@ def test_fillna_series_method(self, data_missing, method):
fill_value = data_missing[1]

if method == 'ffill':
data_missing = type(data_missing)(data_missing[::-1])
data_missing = data_missing[::-1]

result = pd.Series(data_missing).fillna(method=method)
expected = pd.Series(
data_missing._from_sequence([fill_value, fill_value]))
expected = pd.Series(data_missing._from_sequence(
[fill_value, fill_value], dtype=data_missing.dtype))

self.assert_series_equal(result, expected)

Expand All @@ -111,7 +111,8 @@ def test_fillna_frame(self, data_missing):
}).fillna(fill_value)

expected = pd.DataFrame({
"A": data_missing._from_sequence([fill_value, fill_value]),
"A": data_missing._from_sequence([fill_value, fill_value],
dtype=data_missing.dtype),
"B": [1, 2],
})

Expand Down
Loading