Skip to content

Commit 932e119

Browse files
committed
Updates for PandasArray
1 parent 36c6f00 commit 932e119

File tree

3 files changed

+77
-47
lines changed

3 files changed

+77
-47
lines changed

doc/source/whatsnew/v0.24.0.rst

+16-5
Original file line numberDiff line numberDiff line change
@@ -165,18 +165,29 @@ Reduction and groupby operations such as 'sum' work.
165165

166166
A new top-level method :func:`array` has been added for creating 1-dimensional arrays (:issue:`22860`).
167167
This can be used to create any :ref:`extension array <extending.extension-types>`, including
168-
extension arrays registered by :ref:`3rd party libraries <ecosystem.extensions>`, or to
169-
create NumPy arrays.
168+
extension arrays registered by :ref:`3rd party libraries <ecosystem.extensions>`.
170169

171170
.. ipython:: python
172171
173172
pd.array([1, 2, np.nan], dtype='Int64')
174173
pd.array(['a', 'b', 'c'], dtype='category')
175-
pd.array([1, 2])
176174
177-
Notice that the default return value, if no ``dtype`` is specified, the type of
175+
Passing data for which there isn't a dedicated extension type (e.g. float, integer, etc.)
176+
will return a new :class:`arrays.PandasArray`, which is just a thin (no-copy)
177+
wrapper around a :class:`numpy.ndarray` that satisfies the extension array interface.
178+
179+
.. ipython:: python
180+
181+
pd.array([1, 2, 3])
182+
183+
On its own, a :class:`arrays.PandasArray` isn't a very useful object.
184+
But if you need to write low-level code that works generically for any
185+
:class:`~pandas.api.extensions.ExtensionArray`, :class:`arrays.PandasArray`
186+
satisfies that need.
187+
188+
Notice that by default, if no ``dtype`` is specified, the dtype of the returned
178189
array is inferred from the data. In particular, note that the first example of
179-
``[1, 2, np.nan]`` will return a floating-point NumPy array, since ``NaN``
190+
``[1, 2, np.nan]`` would have returned a floating-point array, since ``NaN``
180191
is a float.
181192

182193
.. ipython:: python

pandas/core/arrays/array_.py

+33-18
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ def array(data, # type: Sequence[object]
1313
dtype=None, # type: Optional[Union[str, np.dtype, ExtensionDtype]]
1414
copy=True, # type: bool
1515
):
16-
# type: (...) -> Union[str, np.dtype, ExtensionDtype]
16+
# type: (...) -> ExtensionArray
1717
"""
1818
Create an array.
1919
@@ -58,20 +58,27 @@ def array(data, # type: Sequence[object]
5858
5959
For all other cases, NumPy's usual inference rules will be used.
6060
61-
To avoid *future* breaking changing, pandas recommends using actual
62-
dtypes, and not string aliases, for `dtype`. In other words, use
61+
To avoid *future* breaking changes, when the underlying memory
62+
representation of the returned array matters, we recommend specifying
63+
the `dtype` as a concrete object rather than a string alias or
64+
allowing it to be inferred. For example, a future version of pandas
65+
or a 3rd-party library may include a dedicated ExtensionArray for
66+
string data. In this event, the following would no longer return a
67+
:class:`PandasArray` backed by a NumPy array.
6368
64-
>>> pd.array([1, 2, 3], dtype=np.dtype("int32"))
65-
array([1, 2, 3], dtype=int32)
69+
>>> pd.array(['a', 'b'], dtype=str)
70+
<PandasArray>
71+
['a', 'b']
72+
Length: 2, dtype: str32
6673
67-
rather than
74+
This would instead return the new ExtensionArray dedicated for string
75+
data. If you really need the new array to be backed by a NumPy array,
76+
specify that in the dtype.
6877
69-
>>> pd.array([1, 2, 3], dtype="int32")
70-
array([1, 2, 3], dtype=int32)
71-
72-
If and when pandas switches to a different backend for storing arrays,
73-
the meaning of the string aliases will change, while the actual
74-
dtypes will be unambiguous.
78+
>>> pd.array(['a', 'b'], dtype=np.dtype("<U1"))
79+
<PandasArray>
80+
['a', 'b']
81+
Length: 2, dtype: str32
7582
7683
copy : bool, default True
7784
Whether to copy the data, even if not necessary. Depending
@@ -80,7 +87,7 @@ def array(data, # type: Sequence[object]
8087
8188
Returns
8289
-------
83-
array : Union[numpy.ndarray, ExtensionArray]
90+
array : ExtensionArray
8491
8592
Raises
8693
------
@@ -109,12 +116,16 @@ def array(data, # type: Sequence[object]
109116
:meth:`numpy.array`, and an ``ndarray`` is returned.
110117
111118
>>> pd.array([1, 2])
112-
array([1, 2])
119+
<PandasArray>
120+
[1, 2]
121+
Length: 2, dtype: int64
113122
114123
Or the NumPy dtype can be specified
115124
116125
>>> pd.array([1, 2], dtype=np.dtype("int32"))
117-
array([1, 2], dtype=int32)
126+
<PandasArray>
127+
[1, 2]
128+
Length: 2, dtype: int32
118129
119130
You can use the string alias for `dtype`
120131
@@ -134,7 +145,9 @@ def array(data, # type: Sequence[object]
134145
NumPy array.
135146
136147
>>> pd.array([1, 2, np.nan])
137-
array([ 1., 2., nan])
148+
<PandasArray>
149+
[1.0, 2.0, nan]
150+
Length: 3, dtype: float64
138151
139152
To use pandas' nullable :class:`pandas.arrays.IntegerArray`, specify
140153
the dtype:
@@ -159,7 +172,7 @@ def array(data, # type: Sequence[object]
159172
ValueError: Cannot pass scalar '1' to 'pandas.array'.
160173
"""
161174
from pandas.core.arrays import (
162-
period_array, ExtensionArray, IntervalArray
175+
period_array, ExtensionArray, IntervalArray, PandasArray
163176
)
164177

165178
if lib.is_scalar(data):
@@ -202,4 +215,6 @@ def array(data, # type: Sequence[object]
202215
# TODO(DatetimeArray): handle this type
203216
# TODO(BooleanArray): handle this type
204217

205-
return np.array(data, dtype=dtype, copy=copy)
218+
result = np.array(data, dtype=dtype, copy=copy)
219+
result = PandasArray(result)
220+
return result

pandas/tests/arrays/test_array.py

+28-24
Original file line numberDiff line numberDiff line change
@@ -7,22 +7,22 @@
77

88
import pandas as pd
99
from pandas.api.extensions import register_extension_dtype
10-
from pandas.core.arrays import integer_array, period_array
10+
from pandas.core.arrays import PandasArray, integer_array, period_array
1111
from pandas.tests.extension.decimal import (
1212
DecimalArray, DecimalDtype, to_decimal)
1313
import pandas.util.testing as tm
1414

1515

1616
@pytest.mark.parametrize("data, dtype, expected", [
1717
# Basic NumPy defaults.
18-
([1, 2], None, np.array([1, 2])),
19-
([1, 2], object, np.array([1, 2], dtype=object)),
18+
([1, 2], None, PandasArray(np.array([1, 2]))),
19+
([1, 2], object, PandasArray(np.array([1, 2], dtype=object))),
2020
([1, 2], np.dtype('float32'),
21-
np.array([1., 2.0], dtype=np.dtype('float32'))),
22-
(np.array([1, 2]), None, np.array([1, 2])),
21+
PandasArray(np.array([1., 2.0], dtype=np.dtype('float32')))),
22+
(np.array([1, 2]), None, PandasArray(np.array([1, 2]))),
2323
2424
# String alias passes through to NumPy
25-
([1, 2], 'float32', np.array([1, 2], dtype='float32')),
25+
([1, 2], 'float32', PandasArray(np.array([1, 2], dtype='float32'))),
2626
2727
# Period alias
2828
([pd.Period('2000', 'D'), pd.Period('2001', 'D')], 'Period[D]',
@@ -34,7 +34,7 @@
3434
3535
# Datetime (naive)
3636
([1, 2], np.dtype('datetime64[ns]'),
37-
np.array([1, 2], dtype='datetime64[ns]')),
37+
PandasArray(np.array([1, 2], dtype='datetime64[ns]'))),
3838
# TODO(DatetimeArray): add here
3939
4040
# Category
@@ -51,10 +51,10 @@
5151
5252
# IntegerNA
5353
([1, None], 'Int16', integer_array([1, None], dtype='Int16')),
54-
(pd.Series([1, 2]), None, np.array([1, 2], dtype=np.int64)),
54+
(pd.Series([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))),
5555
5656
# Index
57-
(pd.Index([1, 2]), None, np.array([1, 2], dtype=np.int64)),
57+
(pd.Index([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))),
5858
5959
# Series[EA] returns the EA
6060
(pd.Series(pd.Categorical(['a', 'b'], categories=['a', 'b', 'c'])),
@@ -64,10 +64,6 @@
6464
# "3rd party" EAs work
6565
([decimal.Decimal(0), decimal.Decimal(1)], 'decimal', to_decimal([0, 1])),
6666
67-
# 2D ndarrays pass through
68-
(np.array([[1, 2], [3, 4]]), None, np.array([[1, 2], [3, 4]])),
69-
([[1, 2], [3, 4]], None, np.array([[1, 2, ], [3, 4]])),
70-
7167
# pass an ExtensionArray, but a different dtype
7268
(period_array(['2000', '2001'], freq='D'),
7369
'category',
@@ -82,15 +78,15 @@ def test_array_copy():
8278
a = np.array([1, 2])
8379
# default is to copy
8480
b = pd.array(a)
85-
assert np.shares_memory(a, b) is False
81+
assert np.shares_memory(a, b._ndarray) is False
8682

8783
# copy=True
8884
b = pd.array(a, copy=True)
89-
assert np.shares_memory(a, b) is False
85+
assert np.shares_memory(a, b._ndarray) is False
9086

9187
# copy=False
9288
b = pd.array(a, copy=False)
93-
assert a is b
89+
assert np.shares_memory(a, b._ndarray) is True
9490

9591

9692
@pytest.mark.parametrize('data, expected', [
@@ -112,10 +108,24 @@ def test_array_inference(data, expected):
112108
])
113109
def test_array_inference_fails(data):
114110
result = pd.array(data)
115-
expected = np.array(data, dtype=object)
116-
tm.assert_numpy_array_equal(result, expected)
111+
expected = PandasArray(np.array(data, dtype=object))
112+
tm.assert_extension_array_equal(result, expected)
113+
114+
115+
@pytest.mark.parametrize("data", [
116+
np.array([[1, 2], [3, 4]]),
117+
[[1, 2], [3, 4]],
118+
])
119+
def test_nd_raises(data):
120+
with pytest.raises(ValueError, match='PandasArray must be 1-dimensional'):
121+
pd.array(data)
117122

118123

124+
def test_scalar_raises():
125+
with pytest.raises(ValueError,
126+
match="Cannot pass scalar '1'"):
127+
pd.array(1)
128+
119129
# ---------------------------------------------------------------------------
120130
# A couple dummy classes to ensure that Series and Indexes are unboxed before
121131
# getting to the EA classes.
@@ -169,9 +179,3 @@ def test_array_not_registered(registry_without_decimal):
169179
result = pd.array(data, dtype=DecimalDtype)
170180
expected = DecimalArray._from_sequence(data)
171181
tm.assert_equal(result, expected)
172-
173-
174-
def test_scalar_raises():
175-
with pytest.raises(ValueError,
176-
match="Cannot pass scalar '1'"):
177-
pd.array(1)

0 commit comments

Comments
 (0)