API: Standard signature for to_numpy (pandas-dev#24341)

TomAugspurger · TomAugspurger · commit 1078b251c288 · 2018-12-20T09:10:29.000-06:00
diff --git a/doc/source/basics.rst b/doc/source/basics.rst
@@ -87,6 +87,27 @@ When the Series or Index is backed by
 an :class:`~pandas.api.extension.ExtensionArray`, :meth:`~Series.to_numpy`
 may involve copying data and coercing values. See :ref:`basics.dtypes` for more.
 
+:meth:`~Series.to_numpy` gives some control over the ``dtype`` of the
+resulting :class:`ndarray`. For example, consider datetimes with timezones.
+NumPy doesn't have a dtype to represent timezone-aware datetimes, so there
+are two possibly useful representations:
+
+1. An object-dtype :class:`ndarray` with :class:`Timestamp` objects, each
+   with the correct ``tz``
+2. A ``datetime64[ns]`` -dtype :class:`ndarray`, where the values have
+   been converted to UTC and the timezone discarded
+
+Timezones may be preserved with ``dtype=object``
+
+.. ipython:: python
+
+   ser = pd.Series(pd.date_range('2000', periods=2, tz="CET"))
+   ser.to_numpy(dtype=object)
+
+Or thrown away with ``dtype='datetime64[ns]'``
+
+   ser.to_numpy(dtype="datetime64[ns]")
+
 Getting the "raw data" inside a :class:`DataFrame` is possibly a bit more
 complex. When your ``DataFrame`` only has a single data type for all the
 columns, :meth:`DataFrame.to_numpy` will return the underlying data:
diff --git a/pandas/core/base.py b/pandas/core/base.py
@@ -857,18 +857,22 @@ def array(self):
             result = PandasArray(result)
         return result
 
-    def to_numpy(self):
+    def to_numpy(self, dtype=None, copy=False):
         """
         A NumPy ndarray representing the values in this Series or Index.
 
         .. versionadded:: 0.24.0
 
-        The returned array will be the same up to equality (values equal
-        in `self` will be equal in the returned array; likewise for values
-        that are not equal). When `self` contains an ExtensionArray, the
-        dtype may be different. For example, for a category-dtype Series,
-        ``to_numpy()`` will return a NumPy array and the categorical dtype
-        will be lost.
+
+        Parameters
+        ----------
+        dtype : str or numpy.dtype, optional
+            The dtype to pass to :meth:`numpy.asarray`
+        copy : bool, default False
+            Whether to ensure that the returned value is a not a view on
+            another array. Note that ``copy=False`` does not *ensure* that
+            ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that
+            a copy is made, even if not strictly necessary.
 
         Returns
         -------
@@ -882,10 +886,18 @@ def to_numpy(self):
 
         Notes
         -----
+        The returned array will be the same up to equality (values equal
+        in `self` will be equal in the returned array; likewise for values
+        that are not equal). When `self` contains an ExtensionArray, the
+        dtype may be different. For example, for a category-dtype Series,
+        ``to_numpy()`` will return a NumPy array and the categorical dtype
+        will be lost.
+
+
         For NumPy dtypes, this will be a reference to the actual data stored
-        in this Series or Index. Modifying the result in place will modify
-        the data stored in the Series or Index (not that we recommend doing
-        that).
+        in this Series or Index (assuming ``copy=False``). Modifying the result
+        in place will modify the data stored in the Series or Index (not that
+        we recommend doing that).
 
         For extension types, ``to_numpy()`` *may* require copying data and
         coercing the result to a NumPy type (possibly object), which may be
@@ -910,12 +922,37 @@ def to_numpy(self):
         >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a']))
         >>> ser.to_numpy()
         array(['a', 'b', 'a'], dtype=object)
+
+        Specify the `dtype` to control how datetime-aware data is represented.
+        Use ``dtype=object`` to return an ndarray of pandas :class:`Timestamp`
+        objects, each with the correct ``tz``.
+
+        >>> ser = pd.Series(pd.date_range('2000', periods=2, tz="CET"))
+        >>> ser.to_numpy(dtype=object)
+        array([Timestamp('2000-01-01 00:00:00+0100', tz='CET', freq='D'),
+               Timestamp('2000-01-02 00:00:00+0100', tz='CET', freq='D')],
+              dtype=object)
+
+        Or ``dtype='datetime64[ns]'`` to return an ndarray of native
+        datetime64 values. The values are converted to UTC and the timezone
+        info is dropped.
+
+        >>> ser.to_numpy(dtype="datetime64[ns]")
+        ... # doctest: +ELLIPSIS
+        array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00...'],
+              dtype='datetime64[ns]')
         """
         if (is_extension_array_dtype(self.dtype) or
                 is_datetime64tz_dtype(self.dtype)):
             # TODO(DatetimeArray): remove the second clause.
-            return np.asarray(self._values)
-        return self._values
+            # TODO(GH-24345): Avoid potential double copy
+            result = np.asarray(self._values, dtype=dtype)
+        else:
+            result = self._values
+
+        if copy:
+            result = result.copy()
+        return result
 
     @property
     def _ndarray_values(self):
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -1126,17 +1126,27 @@ def from_dict(cls, data, orient='columns', dtype=None, columns=None):
 
         return cls(data, index=index, columns=columns, dtype=dtype)
 
-    def to_numpy(self):
+    def to_numpy(self, dtype=None, copy=False):
         """
         Convert the DataFrame to a NumPy array.
 
         .. versionadded:: 0.24.0
 
-        The dtype of the returned array will be the common NumPy
-        dtype of all types in the DataFrame. For example,
-        if the dtypes are ``float16`` and ``float32``, the results
-        dtype will be ``float32``. This may require copying data and
-        coercing values, which may be expensive.
+        By default, the dtype of the returned array will be the common NumPy
+        dtype of all types in the DataFrame. For example, if the dtypes are
+        ``float16`` and ``float32``, the results dtype will be ``float32``.
+        This may require copying data and coercing values, which may be
+        expensive.
+
+        Parameters
+        ----------
+        dtype : str or numpy.dtype, optional
+            The dtype to pass to :meth:`numpy.asarray`
+        copy : bool, default False
+            Whether to ensure that the returned value is a not a view on
+            another array. Note that ``copy=False`` does not *ensure* that
+            ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that
+            a copy is made, even if not strictly necessary.
 
         Returns
         -------
@@ -1168,7 +1178,8 @@ def to_numpy(self):
         array([[1, 3.0, Timestamp('2000-01-01 00:00:00')],
                [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object)
         """
-        return self.values
+        result = np.array(self.values, dtype=dtype, copy=copy)
+        return result
 
     def to_dict(self, orient='dict', into=dict):
         """
diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py
@@ -325,6 +325,19 @@ def test_to_numpy(self):
         result = df.to_numpy()
         tm.assert_numpy_array_equal(result, expected)
 
+    def test_to_numpy_dtype(self):
+        df = pd.DataFrame({"A": [1, 2], "B": [3, 4.5]})
+        expected = np.array([[1, 3], [2, 4]], dtype="int64")
+        result = df.to_numpy(dtype="int64")
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_to_numpy_copy(self):
+        arr = np.random.randn(4, 3)
+        df = pd.DataFrame(arr)
+        assert df.values.base is arr
+        assert df.to_numpy(copy=False).base is arr
+        assert df.to_numpy(copy=True).base is None
+
     def test_transpose(self, float_frame):
         frame = float_frame
         dft = frame.T
diff --git a/pandas/tests/indexes/multi/test_conversion.py b/pandas/tests/indexes/multi/test_conversion.py
@@ -17,6 +17,12 @@ def test_tolist(idx):
     assert result == exp
 
 
+def test_to_numpy(idx):
+    result = idx.to_numpy()
+    exp = idx.values
+    tm.assert_numpy_array_equal(result, exp)
+
+
 def test_to_frame():
     tuples = [(1, 'one'), (1, 'two'), (2, 'one'), (2, 'two')]
 
diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py
@@ -1318,3 +1318,43 @@ def test_to_numpy(array, expected, box):
 
     result = thing.to_numpy()
     tm.assert_numpy_array_equal(result, expected)
+
+
+@pytest.mark.parametrize("as_series", [True, False])
+@pytest.mark.parametrize("arr", [
+    np.array([1, 2, 3], dtype="int64"),
+    np.array(['a', 'b', 'c'], dtype=object),
+])
+def test_to_numpy_copy(arr, as_series):
+    obj = pd.Index(arr, copy=False)
+    if as_series:
+        obj = pd.Series(obj.values, copy=False)
+
+    # no copy by default
+    result = obj.to_numpy()
+    assert np.shares_memory(arr, result) is True
+
+    result = obj.to_numpy(copy=False)
+    assert np.shares_memory(arr, result) is True
+
+    # copy=True
+    result = obj.to_numpy(copy=True)
+    assert np.shares_memory(arr, result) is False
+
+
+@pytest.mark.parametrize("as_series", [True, False])
+def test_to_numpy_dtype(as_series):
+    tz = "US/Eastern"
+    obj = pd.DatetimeIndex(['2000', '2001'], tz=tz)
+    if as_series:
+        obj = pd.Series(obj)
+    result = obj.to_numpy(dtype=object)
+    expected = np.array([pd.Timestamp('2000', tz=tz),
+                         pd.Timestamp('2001', tz=tz)],
+                        dtype=object)
+    tm.assert_numpy_array_equal(result, expected)
+
+    result = obj.to_numpy()
+    expected = np.array(['2000-01-01T05', '2001-01-01T05'],
+                        dtype='M8[ns]')
+    tm.assert_numpy_array_equal(result, expected)