Skip to content

Commit 83be235

Browse files
jorisvandenbossche authored and jreback committed
Split fastpath IntegerArray constructor and general purpose constructor (#22070)
1 parent 859d895 commit 83be235

File tree

7 files changed

+145
-56
lines changed

7 files changed

+145
-56
lines changed

pandas/core/arrays/__init__.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -7,4 +7,4 @@
77
from .period import PeriodArrayMixin # noqa
88
from .timedeltas import TimedeltaArrayMixin # noqa
99
from .integer import ( # noqa
10-
IntegerArray, to_integer_array)
10+
IntegerArray, integer_array)

pandas/core/arrays/integer.py

+52-33
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,7 @@
1111
from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass
1212
from pandas.core.dtypes.common import (
1313
is_integer, is_scalar, is_float,
14+
is_bool_dtype,
1415
is_float_dtype,
1516
is_integer_dtype,
1617
is_object_dtype,
@@ -76,7 +77,7 @@ def construct_from_string(cls, string):
7677
"'{}'".format(cls, string))
7778

7879

79-
def to_integer_array(values, dtype=None):
80+
def integer_array(values, dtype=None, copy=False):
8081
"""
8182
Infer and return an integer array of the values.
8283
@@ -85,6 +86,7 @@ def to_integer_array(values, dtype=None):
8586
values : 1D list-like
8687
dtype : dtype, optional
8788
dtype to coerce
89+
copy : boolean, default False
8890
8991
Returns
9092
-------
@@ -94,7 +96,8 @@ def to_integer_array(values, dtype=None):
9496
------
9597
TypeError if incompatible types
9698
"""
97-
return IntegerArray(values, dtype=dtype, copy=False)
99+
values, mask = coerce_to_array(values, dtype=dtype, copy=copy)
100+
return IntegerArray(values, mask)
98101

99102

100103
def safe_cast(values, dtype, copy):
@@ -133,6 +136,11 @@ def coerce_to_array(values, dtype, mask=None, copy=False):
133136
-------
134137
tuple of (values, mask)
135138
"""
139+
# if values is an integer numpy array, preserve its dtype
140+
if dtype is None and hasattr(values, 'dtype'):
141+
if is_integer_dtype(values.dtype):
142+
dtype = values.dtype
143+
136144
if dtype is not None:
137145
if not issubclass(type(dtype), _IntegerDtype):
138146
try:
@@ -174,10 +182,7 @@ def coerce_to_array(values, dtype, mask=None, copy=False):
174182

175183
# infer dtype if needed
176184
if dtype is None:
177-
if is_integer_dtype(values):
178-
dtype = values.dtype
179-
else:
180-
dtype = np.dtype('int64')
185+
dtype = np.dtype('int64')
181186
else:
182187
dtype = dtype.type
183188

@@ -197,47 +202,62 @@ def coerce_to_array(values, dtype, mask=None, copy=False):
197202

198203
class IntegerArray(ExtensionArray, ExtensionOpsMixin):
199204
"""
200-
We represent an IntegerArray with 2 numpy arrays
205+
Array of integer (optional missing) values.
206+
207+
We represent an IntegerArray with 2 numpy arrays:
208+
201209
- data: contains a numpy integer array of the appropriate dtype
202-
- mask: a boolean array holding a mask on the data, False is missing
210+
- mask: a boolean array holding a mask on the data, True is missing
211+
212+
To construct an IntegerArray from generic array-like input, use
213+
``integer_array`` function instead.
214+
215+
Parameters
216+
----------
217+
values : integer 1D numpy array
218+
mask : boolean 1D numpy array
219+
copy : bool, default False
220+
221+
Returns
222+
-------
223+
IntegerArray
224+
203225
"""
204226

205227
@cache_readonly
206228
def dtype(self):
207229
return _dtypes[str(self._data.dtype)]
208230

209-
def __init__(self, values, mask=None, dtype=None, copy=False):
210-
"""
211-
Parameters
212-
----------
213-
values : 1D list-like / IntegerArray
214-
mask : 1D list-like, optional
215-
dtype : subclass of _IntegerDtype, optional
216-
copy : bool, default False
231+
def __init__(self, values, mask, copy=False):
232+
if not (isinstance(values, np.ndarray)
233+
and is_integer_dtype(values.dtype)):
234+
raise TypeError("values should be integer numpy array. Use "
235+
"the 'integer_array' function instead")
236+
if not (isinstance(mask, np.ndarray) and is_bool_dtype(mask.dtype)):
237+
raise TypeError("mask should be boolean numpy array. Use "
238+
"the 'integer_array' function instead")
217239

218-
Returns
219-
-------
220-
IntegerArray
221-
"""
222-
self._data, self._mask = coerce_to_array(
223-
values, dtype=dtype, mask=mask, copy=copy)
240+
if copy:
241+
values = values.copy()
242+
mask = mask.copy()
243+
244+
self._data = values
245+
self._mask = mask
224246

225247
@classmethod
226248
def _from_sequence(cls, scalars, dtype=None, copy=False):
227-
return cls(scalars, dtype=dtype, copy=copy)
249+
return integer_array(scalars, dtype=dtype, copy=copy)
228250

229251
@classmethod
230252
def _from_factorized(cls, values, original):
231-
return cls(values, dtype=original.dtype)
253+
return integer_array(values, dtype=original.dtype)
232254

233255
def __getitem__(self, item):
234256
if is_integer(item):
235257
if self._mask[item]:
236258
return self.dtype.na_value
237259
return self._data[item]
238-
return type(self)(self._data[item],
239-
mask=self._mask[item],
240-
dtype=self.dtype)
260+
return type(self)(self._data[item], self._mask[item])
241261

242262
def _coerce_to_ndarray(self):
243263
"""
@@ -294,7 +314,7 @@ def take(self, indexer, allow_fill=False, fill_value=None):
294314
result[fill_mask] = fill_value
295315
mask = mask ^ fill_mask
296316

297-
return type(self)(result, mask=mask, dtype=self.dtype, copy=False)
317+
return type(self)(result, mask, copy=False)
298318

299319
def copy(self, deep=False):
300320
data, mask = self._data, self._mask
@@ -304,7 +324,7 @@ def copy(self, deep=False):
304324
else:
305325
data = data.copy()
306326
mask = mask.copy()
307-
return type(self)(data, mask, dtype=self.dtype, copy=False)
327+
return type(self)(data, mask, copy=False)
308328

309329
def __setitem__(self, key, value):
310330
_is_scalar = is_scalar(value)
@@ -356,7 +376,7 @@ def _na_value(self):
356376
def _concat_same_type(cls, to_concat):
357377
data = np.concatenate([x._data for x in to_concat])
358378
mask = np.concatenate([x._mask for x in to_concat])
359-
return cls(data, mask=mask, dtype=to_concat[0].dtype)
379+
return cls(data, mask)
360380

361381
def astype(self, dtype, copy=True):
362382
"""Cast to a NumPy array or IntegerArray with 'dtype'.
@@ -386,8 +406,7 @@ def astype(self, dtype, copy=True):
386406
if isinstance(dtype, _IntegerDtype):
387407
result = self._data.astype(dtype.numpy_dtype,
388408
casting='same_kind', copy=False)
389-
return type(self)(result, mask=self._mask,
390-
dtype=dtype, copy=False)
409+
return type(self)(result, mask=self._mask, copy=False)
391410

392411
# coerce
393412
data = self._coerce_to_ndarray()
@@ -523,7 +542,7 @@ def _maybe_mask_result(self, result, mask, other, op_name):
523542
result[mask] = np.nan
524543
return result
525544

526-
return type(self)(result, mask=mask, dtype=self.dtype, copy=False)
545+
return type(self)(result, mask, copy=False)
527546

528547
@classmethod
529548
def _create_arithmetic_method(cls, op):

pandas/core/indexes/base.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -300,7 +300,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
300300
if not (dtype is None or is_object_dtype(dtype)):
301301

302302
# coerce to the provided dtype
303-
data = dtype.construct_array_type()(
303+
data = dtype.construct_array_type()._from_sequence(
304304
data, dtype=dtype, copy=False)
305305

306306
# coerce to the object dtype

pandas/core/series.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -4098,7 +4098,7 @@ def _try_cast(arr, take_fast_path):
40984098
ordered=dtype.ordered)
40994099
elif is_extension_array_dtype(dtype):
41004100
# create an extension array from its dtype
4101-
array_type = dtype.construct_array_type()
4101+
array_type = dtype.construct_array_type()._from_sequence
41024102
subarr = array_type(subarr, dtype=dtype, copy=copy)
41034103

41044104
elif dtype is not None and raise_cast_failure:

pandas/tests/extension/base/getitem.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -213,7 +213,7 @@ def test_take_series(self, data):
213213
s = pd.Series(data)
214214
result = s.take([0, -1])
215215
expected = pd.Series(
216-
data._from_sequence([data[0], data[len(data) - 1]]),
216+
data._from_sequence([data[0], data[len(data) - 1]], dtype=s.dtype),
217217
index=[0, len(data) - 1])
218218
self.assert_series_equal(result, expected)
219219

pandas/tests/extension/base/missing.py

+7-6
Original file line number | Diff line number | Diff line change
@@ -77,8 +77,8 @@ def test_fillna_series(self, data_missing):
7777
ser = pd.Series(data_missing)
7878

7979
result = ser.fillna(fill_value)
80-
expected = pd.Series(
81-
data_missing._from_sequence([fill_value, fill_value]))
80+
expected = pd.Series(data_missing._from_sequence(
81+
[fill_value, fill_value], dtype=data_missing.dtype))
8282
self.assert_series_equal(result, expected)
8383

8484
# Fill with a series
@@ -94,11 +94,11 @@ def test_fillna_series_method(self, data_missing, method):
9494
fill_value = data_missing[1]
9595

9696
if method == 'ffill':
97-
data_missing = type(data_missing)(data_missing[::-1])
97+
data_missing = data_missing[::-1]
9898

9999
result = pd.Series(data_missing).fillna(method=method)
100-
expected = pd.Series(
101-
data_missing._from_sequence([fill_value, fill_value]))
100+
expected = pd.Series(data_missing._from_sequence(
101+
[fill_value, fill_value], dtype=data_missing.dtype))
102102

103103
self.assert_series_equal(result, expected)
104104

@@ -111,7 +111,8 @@ def test_fillna_frame(self, data_missing):
111111
}).fillna(fill_value)
112112

113113
expected = pd.DataFrame({
114-
"A": data_missing._from_sequence([fill_value, fill_value]),
114+
"A": data_missing._from_sequence([fill_value, fill_value],
115+
dtype=data_missing.dtype),
115116
"B": [1, 2],
116117
})
117118

0 commit comments

Comments
 (0)