Updates for PandasArray

TomAugspurger · TomAugspurger · commit 932e119c3383 · 2018-12-28T13:06:54.000-06:00
diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
@@ -165,18 +165,29 @@ Reduction and groupby operations such as 'sum' work.
 
 A new top-level method :func:`array` has been added for creating 1-dimensional arrays (:issue:`22860`).
 This can be used to create any :ref:`extension array <extending.extension-types>`, including
-extension arrays registered by :ref:`3rd party libraries <ecosystem.extensions>`, or to
-create NumPy arrays.
+extension arrays registered by :ref:`3rd party libraries <ecosystem.extensions>`.
 
 .. ipython:: python
 
    pd.array([1, 2, np.nan], dtype='Int64')
    pd.array(['a', 'b', 'c'], dtype='category')
-   pd.array([1, 2])
 
-Notice that the default return value, if no ``dtype`` is specified, the type of
+Passing data for which there isn't a dedicated extension type (e.g. float, integer, etc.)
+will return a new :class:`arrays.PandasArray`, which is just a thin (no-copy)
+wrapper around a :class:`numpy.ndarray` that satisfies the extension array interface.
+
+.. ipython:: python
+
+   pd.array([1, 2, 3])
+
+On its own, a :class:`arrays.PandasArray` isn't a very useful object.
+But if you need to write low-level code that works generically for any
+:class:`~pandas.api.extensions.ExtensionArray`, :class:`arrays.PandasArray`
+satisfies that need.
+
+Notice that by default, if no ``dtype`` is specified, the dtype of the returned
 array is inferred from the data. In particular, note that the first example of
-``[1, 2, np.nan]`` will return a floating-point NumPy array, since ``NaN``
+``[1, 2, np.nan]`` would have returned a floating-point array, since ``NaN``
 is a float.
 
 .. ipython:: python
diff --git a/pandas/core/arrays/array_.py b/pandas/core/arrays/array_.py
@@ -13,7 +13,7 @@ def array(data,         # type: Sequence[object]
           dtype=None,   # type: Optional[Union[str, np.dtype, ExtensionDtype]]
           copy=True,    # type: bool
           ):
-    # type: (...) -> Union[str, np.dtype, ExtensionDtype]
+    # type: (...) -> ExtensionArray
     """
     Create an array.
 
@@ -58,20 +58,27 @@ def array(data,         # type: Sequence[object]
 
         For all other cases, NumPy's usual inference rules will be used.
 
-        To avoid *future* breaking changing, pandas recommends using actual
-        dtypes, and not string aliases, for `dtype`. In other words, use
+        To avoid *future* breaking changes, when the underlying memory
+        representation of the returned array matters, we recommend specifying
+        the `dtype` as a concrete object rather than a string alias or
+        allowing it to be inferred. For example, a future version of pandas
+        or a 3rd-party library may include a dedicated ExtensionArray for
+        string data. In this event, the following would no longer return a
+        :class:`PandasArray` backed by a NumPy array.
 
-        >>> pd.array([1, 2, 3], dtype=np.dtype("int32"))
-        array([1, 2, 3], dtype=int32)
+        >>> pd.array(['a', 'b'], dtype=str)
+        <PandasArray>
+        ['a', 'b']
+        Length: 2, dtype: str32
 
-        rather than
+        This would instead return the new ExtensionArray dedicated for string
+        data. If you really need the new array to be backed by a NumPy array,
+        specify that in the dtype.
 
-        >>> pd.array([1, 2, 3], dtype="int32")
-        array([1, 2, 3], dtype=int32)
-
-        If and when pandas switches to a different backend for storing arrays,
-        the meaning of the string aliases will change, while the actual
-        dtypes will be unambiguous.
+        >>> pd.array(['a', 'b'], dtype=np.dtype("<U1"))
+        <PandasArray>
+        ['a', 'b']
+        Length: 2, dtype: str32
 
     copy : bool, default True
         Whether to copy the data, even if not necessary. Depending
@@ -80,7 +87,7 @@ def array(data,         # type: Sequence[object]
 
     Returns
     -------
-    array : Union[numpy.ndarray, ExtensionArray]
+    array : ExtensionArray
 
     Raises
     ------
@@ -109,12 +116,16 @@ def array(data,         # type: Sequence[object]
     :meth:`numpy.array`, and an ``ndarray`` is returned.
 
     >>> pd.array([1, 2])
-    array([1, 2])
+    <PandasArray>
+    [1, 2]
+    Length: 2, dtype: int64
 
     Or the NumPy dtype can be specified
 
     >>> pd.array([1, 2], dtype=np.dtype("int32"))
-    array([1, 2], dtype=int32)
+    <PandasArray>
+    [1, 2]
+    Length: 2, dtype: int32
 
     You can use the string alias for `dtype`
 
@@ -134,7 +145,9 @@ def array(data,         # type: Sequence[object]
     NumPy array.
 
     >>> pd.array([1, 2, np.nan])
-    array([ 1.,  2., nan])
+    <PandasArray>
+    [1.0, 2.0, nan]
+    Length: 3, dtype: float64
 
     To use pandas' nullable :class:`pandas.arrays.IntegerArray`, specify
     the dtype:
@@ -159,7 +172,7 @@ def array(data,         # type: Sequence[object]
     ValueError: Cannot pass scalar '1' to 'pandas.array'.
     """
     from pandas.core.arrays import (
-        period_array, ExtensionArray, IntervalArray
+        period_array, ExtensionArray, IntervalArray, PandasArray
     )
 
     if lib.is_scalar(data):
@@ -202,4 +215,6 @@ def array(data,         # type: Sequence[object]
         # TODO(DatetimeArray): handle this type
         # TODO(BooleanArray): handle this type
 
-    return np.array(data, dtype=dtype, copy=copy)
+    result = np.array(data, dtype=dtype, copy=copy)
+    result = PandasArray(result)
+    return result
diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py
@@ -7,22 +7,22 @@
 
 import pandas as pd
 from pandas.api.extensions import register_extension_dtype
-from pandas.core.arrays import integer_array, period_array
+from pandas.core.arrays import PandasArray, integer_array, period_array
 from pandas.tests.extension.decimal import (
     DecimalArray, DecimalDtype, to_decimal)
 import pandas.util.testing as tm
 
 
 @pytest.mark.parametrize("data, dtype, expected", [
     # Basic NumPy defaults.
-    ([1, 2], None, np.array([1, 2])),
-    ([1, 2], object, np.array([1, 2], dtype=object)),
+    ([1, 2], None, PandasArray(np.array([1, 2]))),
+    ([1, 2], object, PandasArray(np.array([1, 2], dtype=object))),
     ([1, 2], np.dtype('float32'),
-     np.array([1., 2.0], dtype=np.dtype('float32'))),
-    (np.array([1, 2]), None, np.array([1, 2])),
+     PandasArray(np.array([1., 2.0], dtype=np.dtype('float32')))),
+    (np.array([1, 2]), None, PandasArray(np.array([1, 2]))),
 
     # String alias passes through to NumPy
-    ([1, 2], 'float32', np.array([1, 2], dtype='float32')),
+    ([1, 2], 'float32', PandasArray(np.array([1, 2], dtype='float32'))),
 
     # Period alias
     ([pd.Period('2000', 'D'), pd.Period('2001', 'D')], 'Period[D]',
@@ -34,7 +34,7 @@
 
     # Datetime (naive)
     ([1, 2], np.dtype('datetime64[ns]'),
-     np.array([1, 2], dtype='datetime64[ns]')),
+     PandasArray(np.array([1, 2], dtype='datetime64[ns]'))),
     # TODO(DatetimeArray): add here
 
     # Category
@@ -51,10 +51,10 @@
 
     # IntegerNA
     ([1, None], 'Int16', integer_array([1, None], dtype='Int16')),
-    (pd.Series([1, 2]), None, np.array([1, 2], dtype=np.int64)),
+    (pd.Series([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))),
 
     # Index
-    (pd.Index([1, 2]), None, np.array([1, 2], dtype=np.int64)),
+    (pd.Index([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))),
 
     # Series[EA] returns the EA
     (pd.Series(pd.Categorical(['a', 'b'], categories=['a', 'b', 'c'])),
@@ -64,10 +64,6 @@
     # "3rd party" EAs work
     ([decimal.Decimal(0), decimal.Decimal(1)], 'decimal', to_decimal([0, 1])),
 
-    # 2D ndarrays pass through
-    (np.array([[1, 2], [3, 4]]), None, np.array([[1, 2], [3, 4]])),
-    ([[1, 2], [3, 4]], None, np.array([[1, 2, ], [3, 4]])),
-
     # pass an ExtensionArray, but a different dtype
     (period_array(['2000', '2001'], freq='D'),
      'category',
@@ -82,15 +78,15 @@ def test_array_copy():
     a = np.array([1, 2])
     # default is to copy
     b = pd.array(a)
-    assert np.shares_memory(a, b) is False
+    assert np.shares_memory(a, b._ndarray) is False
 
     # copy=True
     b = pd.array(a, copy=True)
-    assert np.shares_memory(a, b) is False
+    assert np.shares_memory(a, b._ndarray) is False
 
     # copy=False
     b = pd.array(a, copy=False)
-    assert a is b
+    assert np.shares_memory(a, b._ndarray) is True
 
 
 @pytest.mark.parametrize('data, expected', [
@@ -112,10 +108,24 @@ def test_array_inference(data, expected):
 ])
 def test_array_inference_fails(data):
     result = pd.array(data)
-    expected = np.array(data, dtype=object)
-    tm.assert_numpy_array_equal(result, expected)
+    expected = PandasArray(np.array(data, dtype=object))
+    tm.assert_extension_array_equal(result, expected)
+
+
+@pytest.mark.parametrize("data", [
+    np.array([[1, 2], [3, 4]]),
+    [[1, 2], [3, 4]],
+])
+def test_nd_raises(data):
+    with pytest.raises(ValueError, match='PandasArray must be 1-dimensional'):
+        pd.array(data)
 
 
+def test_scalar_raises():
+    with pytest.raises(ValueError,
+                       match="Cannot pass scalar '1'"):
+        pd.array(1)
+
 # ---------------------------------------------------------------------------
 # A couple dummy classes to ensure that Series and Indexes are unboxed before
 # getting to the EA classes.
@@ -169,9 +179,3 @@ def test_array_not_registered(registry_without_decimal):
     result = pd.array(data, dtype=DecimalDtype)
     expected = DecimalArray._from_sequence(data)
     tm.assert_equal(result, expected)
-
-
-def test_scalar_raises():
-    with pytest.raises(ValueError,
-                       match="Cannot pass scalar '1'"):
-        pd.array(1)