Skip to content

Commit 7959eb6

Browse files
committed
API: Public data attributes for EA-backed containers
This adds two new public accessors for working with EA-backed Series / Index:

- `.array -> Union[ExtensionArray, ndarray]`: the actual backing array
- `.to_numpy() -> ndarray`: a NumPy representation of the data

`.array` is always a reference to the actual data stored in the container. Updating it in place (not recommended) will be reflected in the Series (or Index, for that matter, so really not recommended). `.to_numpy()` may (or may not) require data copying / coercion.

Closes pandas-dev#19954
1 parent d0691e0 commit 7959eb6

File tree

4 files changed

+164
-1
lines changed

4 files changed

+164
-1
lines changed

pandas/core/base.py

+91
Original file line numberDiff line numberDiff line change
@@ -765,6 +765,97 @@ def base(self):
765765
FutureWarning, stacklevel=2)
766766
return self.values.base
767767

768+
@property
769+
def array(self):
770+
# type: () -> Union[np.ndarray, ExtensionArray]
771+
"""The actual Array backing this Series or Index.
772+
773+
Returns
774+
-------
775+
Union[ndarray, ExtensionArray]
776+
This is the actual array stored within this object.
777+
778+
Notes
779+
-----
780+
This table lays out the different array types for each extension
781+
dtype within pandas.
782+
783+
================== =============================
784+
dtype array type
785+
================== =============================
786+
category Categorical
787+
period PeriodArray
788+
interval IntervalArray
789+
IntegerNA IntegerArray
790+
datetime64[ns, tz] datetime64[ns]? DatetimeArray
791+
================== =============================
792+
793+
For any 3rd-party extension types, the array type will be an
794+
ExtensionArray.
795+
796+
For all remaining arrays (ndarrays), ``.array`` will be the ndarray
797+
stored within.
798+
799+
See Also
800+
--------
801+
to_numpy : Similar method that always returns a NumPy array.
802+
803+
Examples
804+
--------
805+
>>> ser = pd.Series(pd.Categorical(['a', 'b', 'a']))
806+
>>> ser.array
807+
[a, b, a]
808+
Categories (2, object): [a, b]
809+
"""
810+
return self._values
811+
812+
def to_numpy(self):
813+
"""A NumPy array representing the values in this Series or Index.
814+
815+
The returned array will be the same up to equality (values equal
816+
in `self` will be equal in the returned array; likewise for values
817+
that are not equal).
818+
819+
Returns
820+
-------
821+
numpy.ndarray
822+
An ndarray with the same values as this Series or Index.
823+
824+
Notes
825+
-----
826+
For NumPy arrays, this will be a reference to the actual data stored
827+
in this Series or Index.
828+
829+
For extension types, this may involve copying data and coercing the
830+
result to a NumPy type (possibly object), which may be expensive.
831+
832+
This table lays out the different array types for each extension
833+
dtype within pandas.
834+
835+
================== ================================
836+
dtype array type
837+
================== ================================
838+
category[T] ndarray[T] (same dtype as input)
839+
period ndarray[object] (Periods)
840+
interval ndarray[object] (Intervals)
841+
IntegerNA ndarray[object]
842+
datetime64[ns, tz] datetime64[ns]? object?
843+
================== ================================
844+
845+
See Also
846+
--------
847+
array : Get the actual data stored within.
848+
849+
Examples
850+
--------
851+
>>> ser = pd.Series(pd.Categorical(['a', 'b', 'a']))
852+
>>> ser.to_numpy()
853+
array(['a', 'b', 'a'], dtype=object)
854+
"""
855+
if is_extension_array_dtype(self.dtype):
856+
return np.asarray(self._values)
857+
return self._values
858+
768859
@property
769860
def _ndarray_values(self):
770861
# type: () -> np.ndarray

pandas/core/indexes/base.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -710,6 +710,7 @@ def values(self):
710710
@property
711711
def _values(self):
712712
# type: () -> Union[ExtensionArray, Index]
713+
# TODO: remove in favor of .array
713714
# TODO(EA): remove index types as they become extension arrays
714715
"""The best array representation.
715716
@@ -739,7 +740,7 @@ def _values(self):
739740
values
740741
_ndarray_values
741742
"""
742-
return self.values
743+
return self._data
743744

744745
def get_values(self):
745746
"""

pandas/core/indexes/multi.py

+20
Original file line numberDiff line numberDiff line change
@@ -288,6 +288,26 @@ def _verify_integrity(self, labels=None, levels=None):
288288
def levels(self):
289289
return self._levels
290290

291+
@property
292+
def _values(self):
293+
# TODO: remove
294+
# We override here, since our parent uses _data, which we don't use.
295+
return self.values
296+
297+
@property
298+
def array(self):
299+
"""
300+
Raises a ValueError for `MultiIndex` because there's no single
301+
array backing a MultiIndex.
302+
303+
Raises
304+
------
305+
ValueError
306+
"""
307+
msg = ("MultiIndex has no single backing array. Use "
308+
"'MultiIndex.to_numpy()' to get a NumPy array of tuples.")
309+
raise ValueError(msg)
310+
291311
@property
292312
def _is_homogeneous_type(self):
293313
"""Whether the levels of a MultiIndex all have the same dtype.

pandas/tests/test_base.py

+51
Original file line numberDiff line numberDiff line change
@@ -1243,3 +1243,54 @@ def test_ndarray_values(array, expected):
12431243
r_values = pd.Index(array)._ndarray_values
12441244
tm.assert_numpy_array_equal(l_values, r_values)
12451245
tm.assert_numpy_array_equal(l_values, expected)
1246+
1247+
1248+
@pytest.mark.parametrize("array, attr", [
1249+
(np.array([1, 2], dtype=np.int64), None),
1250+
(pd.Categorical(['a', 'b']), '_codes'),
1251+
(pd.core.arrays.period_array(['2000', '2001'], freq='D'), '_data'),
1252+
(pd.core.arrays.integer_array([0, np.nan]), '_data'),
1253+
(pd.core.arrays.IntervalArray.from_breaks([0, 1]), '_left'),
1254+
(pd.SparseArray([0, 1]), '_sparse_values'),
1255+
# TODO: DatetimeArray(add)
1256+
])
1257+
@pytest.mark.parametrize('box', [pd.Series, pd.Index])
1258+
def test_array(array, attr, box):
1259+
if array.dtype.name in ('Int64', 'Sparse[int64, 0]'):
1260+
pytest.skip("No index type for {}".format(array.dtype))
1261+
result = box(array, copy=False).array
1262+
1263+
if attr:
1264+
array = getattr(array, attr)
1265+
result = getattr(result, attr)
1266+
1267+
assert result is array
1268+
1269+
1270+
def test_array_multiindex_raises():
1271+
idx = pd.MultiIndex.from_product([['A'], ['a', 'b']])
1272+
with tm.assert_raises_regex(ValueError, 'MultiIndex'):
1273+
idx.array
1274+
1275+
1276+
@pytest.mark.parametrize('array, expected', [
1277+
(np.array([1, 2], dtype=np.int64), np.array([1, 2], dtype=np.int64)),
1278+
(pd.Categorical(['a', 'b']), np.array(['a', 'b'], dtype=object)),
1279+
(pd.core.arrays.period_array(['2000', '2001'], freq='D'),
1280+
np.array([pd.Period('2000', freq="D"), pd.Period('2001', freq='D')])),
1281+
(pd.core.arrays.integer_array([0, np.nan]),
1282+
np.array([1, np.nan], dtype=object)),
1283+
(pd.core.arrays.IntervalArray.from_breaks([0, 1, 2]),
1284+
np.array([pd.Interval(0, 1), pd.Interval(1, 2)], dtype=object)),
1285+
(pd.SparseArray([0, 1]), np.array([0, 1], dtype=np.int64)),
1286+
# TODO: DatetimeArray(add)
1287+
])
1288+
@pytest.mark.parametrize('box', [pd.Series, pd.Index])
1289+
def test_to_numpy(array, expected, box):
1290+
thing = box(array)
1291+
1292+
if array.dtype.name in ('Int64', 'Sparse[int64, 0]'):
1293+
pytest.skip("No index type for {}".format(array.dtype))
1294+
1295+
result = thing.to_numpy()
1296+
tm.assert_numpy_array_equal(result, expected)

0 commit comments

Comments
 (0)