Merge branch 'main' into DEPR-to_datetime-mixed-offsets-with-utc=False

natmokval · natmokval · commit 0549e6d8d721 · 2023-07-25T21:42:45.000+02:00
diff --git a/doc/source/user_guide/window.rst b/doc/source/user_guide/window.rst
@@ -244,7 +244,7 @@ a ``BaseIndexer`` subclass that allows a user to define a custom method for calc
 The ``BaseIndexer`` subclass will need to define a ``get_window_bounds`` method that returns
 a tuple of two arrays, the first being the starting indices of the windows and second being the
 ending indices of the windows. Additionally, ``num_values``, ``min_periods``, ``center``, ``closed``
-and will automatically be passed to ``get_window_bounds`` and the defined method must
+and ``step`` will automatically be passed to ``get_window_bounds`` and the defined method must
 always accept these arguments.
 
 For example, if we have the following :class:`DataFrame`
@@ -259,33 +259,26 @@ For example, if we have the following :class:`DataFrame`
 and we want to use an expanding window where ``use_expanding`` is ``True`` otherwise a window of size
 1, we can create the following ``BaseIndexer`` subclass:
 
-.. code-block:: ipython
-
-   In [2]: from pandas.api.indexers import BaseIndexer
-
-   In [3]: class CustomIndexer(BaseIndexer):
-      ...:     def get_window_bounds(self, num_values, min_periods, center, closed):
-      ...:         start = np.empty(num_values, dtype=np.int64)
-      ...:         end = np.empty(num_values, dtype=np.int64)
-      ...:         for i in range(num_values):
-      ...:             if self.use_expanding[i]:
-      ...:                 start[i] = 0
-      ...:                 end[i] = i + 1
-      ...:             else:
-      ...:                 start[i] = i
-      ...:                 end[i] = i + self.window_size
-      ...:         return start, end
-
-   In [4]: indexer = CustomIndexer(window_size=1, use_expanding=use_expanding)
-
-   In [5]: df.rolling(indexer).sum()
-   Out[5]:
-       values
-   0     0.0
-   1     1.0
-   2     3.0
-   3     3.0
-   4    10.0
+.. ipython:: python
+
+   from pandas.api.indexers import BaseIndexer
+
+   class CustomIndexer(BaseIndexer):
+        def get_window_bounds(self, num_values, min_periods, center, closed, step):
+            start = np.empty(num_values, dtype=np.int64)
+            end = np.empty(num_values, dtype=np.int64)
+            for i in range(num_values):
+                if self.use_expanding[i]:
+                    start[i] = 0
+                    end[i] = i + 1
+                else:
+                    start[i] = i
+                    end[i] = i + self.window_size
+            return start, end
+
+   indexer = CustomIndexer(window_size=1, use_expanding=use_expanding)
+
+   df.rolling(indexer).sum()
 
 You can view other examples of ``BaseIndexer`` subclasses `here <https://github.com/pandas-dev/pandas/blob/main/pandas/core/indexers/objects.py>`__
 
diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
@@ -149,6 +149,7 @@ Other enhancements
 - Adding ``engine_kwargs`` parameter to :meth:`DataFrame.read_excel` (:issue:`52214`)
 - Classes that are useful for type-hinting have been added to the public API in the new submodule ``pandas.api.typing`` (:issue:`48577`)
 - Implemented :attr:`Series.dt.is_month_start`, :attr:`Series.dt.is_month_end`, :attr:`Series.dt.is_year_start`, :attr:`Series.dt.is_year_end`, :attr:`Series.dt.is_quarter_start`, :attr:`Series.dt.is_quarter_end`, :attr:`Series.dt.is_days_in_month`, :attr:`Series.dt.unit`, :meth:`Series.dt.is_normalize`, :meth:`Series.dt.day_name`, :meth:`Series.dt.month_name`, :meth:`Series.dt.tz_convert` for :class:`ArrowDtype` with ``pyarrow.timestamp`` (:issue:`52388`, :issue:`51718`)
+- Implemented :func:`api.interchange.from_dataframe` for :class:`DatetimeTZDtype` (:issue:`54239`)
 - Implemented ``__from_arrow__`` on :class:`DatetimeTZDtype`. (:issue:`52201`)
 - Implemented ``__pandas_priority__`` to allow custom types to take precedence over :class:`DataFrame`, :class:`Series`, :class:`Index`, or :class:`ExtensionArray` for arithmetic operations, :ref:`see the developer guide <extending.pandas_priority>` (:issue:`48347`)
 - Improve error message when having incompatible columns using :meth:`DataFrame.merge` (:issue:`51861`)
@@ -676,6 +677,7 @@ Other
 - Bug in :meth:`DataFrame.shift` with ``axis=1`` on a :class:`DataFrame` with a single :class:`ExtensionDtype` column giving incorrect results (:issue:`53832`)
 - Bug in :meth:`Index.sort_values` when a ``key`` is passed (:issue:`52764`)
 - Bug in :meth:`Series.align`, :meth:`DataFrame.align`, :meth:`Series.reindex`, :meth:`DataFrame.reindex`, :meth:`Series.interpolate`, :meth:`DataFrame.interpolate`, incorrectly failing to raise with method="asfreq" (:issue:`53620`)
+- Bug in :meth:`Series.argsort` failing to raise when an invalid ``axis`` is passed (:issue:`54257`)
 - Bug in :meth:`Series.map` when giving a callable to an empty series, the returned series had ``object`` dtype. It now keeps the original dtype (:issue:`52384`)
 - Bug in :meth:`Series.memory_usage` when ``deep=True`` throw an error with Series of objects and the returned value is incorrect, as it does not take into account GC corrections (:issue:`51858`)
 - Bug in :meth:`period_range` the default behavior when freq was not passed as an argument was incorrect(:issue:`53687`)
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -1678,6 +1678,19 @@ def _reduce(
     # Non-Optimized Default Methods; in the case of the private methods here,
     #  these are not guaranteed to be stable across pandas versions.
 
+    def _values_for_json(self) -> np.ndarray:
+        """
+        Specify how to render our entries in to_json.
+
+        Notes
+        -----
+        The dtype on the returned ndarray is not restricted, but for non-native
+        types that are not specifically handled in objToJSON.c, to_json is
+        liable to raise. In these cases, it may be safer to return an ndarray
+        of strings.
+        """
+        return np.asarray(self)
+
     def _hash_pandas_object(
         self, *, encoding: str, hash_key: str, categorize: bool
     ) -> npt.NDArray[np.uint64]:
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
@@ -2202,6 +2202,12 @@ def _with_freq(self, freq) -> Self:
     # --------------------------------------------------------------
     # ExtensionArray Interface
 
+    def _values_for_json(self) -> np.ndarray:
+        # Small performance bump vs the base class which calls np.asarray(self)
+        if isinstance(self.dtype, np.dtype):
+            return self._ndarray
+        return super()._values_for_json()
+
     def factorize(
         self,
         use_na_sentinel: bool = True,
diff --git a/pandas/core/interchange/column.py b/pandas/core/interchange/column.py
@@ -9,7 +9,10 @@
 from pandas.errors import NoBufferPresent
 from pandas.util._decorators import cache_readonly
 
-from pandas.core.dtypes.dtypes import ArrowDtype
+from pandas.core.dtypes.dtypes import (
+    ArrowDtype,
+    DatetimeTZDtype,
+)
 
 import pandas as pd
 from pandas.api.types import is_string_dtype
@@ -138,6 +141,8 @@ def _dtype_from_pandasdtype(self, dtype) -> tuple[DtypeKind, int, str, str]:
             raise ValueError(f"Data type {dtype} not supported by interchange protocol")
         if isinstance(dtype, ArrowDtype):
             byteorder = dtype.numpy_dtype.byteorder
+        elif isinstance(dtype, DatetimeTZDtype):
+            byteorder = dtype.base.byteorder  # type: ignore[union-attr]
         else:
             byteorder = dtype.byteorder
 
@@ -269,7 +274,13 @@ def _get_data_buffer(
             DtypeKind.BOOL,
             DtypeKind.DATETIME,
         ):
-            buffer = PandasBuffer(self._col.to_numpy(), allow_copy=self._allow_copy)
+            # self.dtype[2] is an ArrowCTypes.TIMESTAMP string such as "tsn:";
+            # a tz-aware column appends the tz, making it longer than 4 chars
+            if self.dtype[0] == DtypeKind.DATETIME and len(self.dtype[2]) > 4:
+                np_arr = self._col.dt.tz_convert(None).to_numpy()
+            else:
+                np_arr = self._col.to_numpy()
+            buffer = PandasBuffer(np_arr, allow_copy=self._allow_copy)
             dtype = self.dtype
         elif self.dtype[0] == DtypeKind.CATEGORICAL:
             codes = self._col.values._codes
diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py
@@ -325,20 +325,20 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
     return np.asarray(str_list, dtype="object"), buffers
 
 
-def parse_datetime_format_str(format_str, data):
+def parse_datetime_format_str(format_str, data) -> pd.Series | np.ndarray:
     """Parse datetime `format_str` to interpret the `data`."""
     # timestamp 'ts{unit}:tz'
     timestamp_meta = re.match(r"ts([smun]):(.*)", format_str)
     if timestamp_meta:
         unit, tz = timestamp_meta.group(1), timestamp_meta.group(2)
-        if tz != "":
-            raise NotImplementedError("Timezones are not supported yet")
         if unit != "s":
             # the format string describes only a first letter of the unit, so
             # add one extra letter to convert the unit to numpy-style:
             # 'm' -> 'ms', 'u' -> 'us', 'n' -> 'ns'
             unit += "s"
         data = data.astype(f"datetime64[{unit}]")
+        if tz != "":
+            data = pd.Series(data).dt.tz_localize("UTC").dt.tz_convert(tz)
         return data
 
     # date 'td{Days/Ms}'
@@ -358,7 +358,7 @@ def parse_datetime_format_str(format_str, data):
     raise NotImplementedError(f"DateTime kind is not supported: {format_str}")
 
 
-def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
+def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray | pd.Series, Any]:
     """
     Convert a column holding DateTime data to a NumPy array.
 
@@ -389,7 +389,7 @@ def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
         length=col.size(),
     )
 
-    data = parse_datetime_format_str(format_str, data)
+    data = parse_datetime_format_str(format_str, data)  # type: ignore[assignment]
     data = set_nulls(data, col, buffers["validity"])
     return data, buffers
 
diff --git a/pandas/core/interchange/utils.py b/pandas/core/interchange/utils.py
@@ -4,7 +4,6 @@
 
 from __future__ import annotations
 
-import re
 import typing
 
 import numpy as np
@@ -14,6 +13,7 @@
 from pandas.core.dtypes.dtypes import (
     ArrowDtype,
     CategoricalDtype,
+    DatetimeTZDtype,
 )
 
 if typing.TYPE_CHECKING:
@@ -134,10 +134,13 @@ def dtype_to_arrow_c_fmt(dtype: DtypeObj) -> str:
 
     if lib.is_np_dtype(dtype, "M"):
         # Selecting the first char of resolution string:
-        # dtype.str -> '<M8[ns]'
-        resolution = re.findall(r"\[(.*)\]", dtype.str)[0][:1]
+        # dtype.str -> '<M8[ns]' -> 'n'
+        resolution = np.datetime_data(dtype)[0][0]
         return ArrowCTypes.TIMESTAMP.format(resolution=resolution, tz="")
 
+    elif isinstance(dtype, DatetimeTZDtype):
+        return ArrowCTypes.TIMESTAMP.format(resolution=dtype.unit[0], tz=dtype.tz)
+
     raise NotImplementedError(
         f"Conversion of {dtype} to Arrow C format string is not implemented."
     )
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
@@ -1638,9 +1638,6 @@ def get_values(self, dtype: DtypeObj | None = None) -> np.ndarray:
         """
         raise AbstractMethodError(self)
 
-    def values_for_json(self) -> np.ndarray:
-        raise AbstractMethodError(self)
-
 
 class EABackedBlock(Block):
     """
@@ -1885,9 +1882,6 @@ def get_values(self, dtype: DtypeObj | None = None) -> np.ndarray:
         # TODO(EA2D): reshape not needed with 2D EAs
         return np.asarray(values).reshape(self.shape)
 
-    def values_for_json(self) -> np.ndarray:
-        return np.asarray(self.values)
-
     @final
     def pad_or_backfill(
         self,
@@ -2174,9 +2168,6 @@ def get_values(self, dtype: DtypeObj | None = None) -> np.ndarray:
             return self.values.astype(_dtype_obj)
         return self.values
 
-    def values_for_json(self) -> np.ndarray:
-        return self.values
-
     @cache_readonly
     def is_numeric(self) -> bool:  # type: ignore[override]
         dtype = self.values.dtype
@@ -2231,9 +2222,6 @@ class DatetimeLikeBlock(NDArrayBackedExtensionBlock):
     is_numeric = False
     values: DatetimeArray | TimedeltaArray
 
-    def values_for_json(self) -> np.ndarray:
-        return self.values._ndarray
-
 
 class DatetimeTZBlock(DatetimeLikeBlock):
     """implement a datetime64 block with a tz attribute"""
@@ -2242,10 +2230,6 @@ class DatetimeTZBlock(DatetimeLikeBlock):
 
     __slots__ = ()
 
-    # Don't use values_for_json from DatetimeLikeBlock since it is
-    # an invalid optimization here(drop the tz)
-    values_for_json = NDArrayBackedExtensionBlock.values_for_json
-
 
 # -----------------------------------------------------------------
 # Constructor Helpers
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
@@ -1008,7 +1008,7 @@ def column_arrays(self) -> list[np.ndarray]:
 
         for blk in self.blocks:
             mgr_locs = blk._mgr_locs
-            values = blk.values_for_json()
+            values = blk.array_values._values_for_json()
             if values.ndim == 1:
                 # TODO(EA2D): special casing not needed with 2D EAs
                 result[mgr_locs[0]] = values
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -3958,6 +3958,10 @@ def argsort(
         2    0
         dtype: int64
         """
+        if axis != -1:
+            # GH#54257 We allow -1 here so that np.argsort(series) works
+            self._get_axis_number(axis)
+
         values = self._values
         mask = isna(values)
 
diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py
@@ -284,3 +284,14 @@ def test_empty_pyarrow(data):
     arrow_df = pa_from_dataframe(expected)
     result = from_dataframe(arrow_df)
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("tz", ["UTC", "US/Pacific"])
+@pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"])
+def test_datetimetzdtype(tz, unit):
+    # GH 54239
+    tz_data = (
+        pd.date_range("2018-01-01", periods=5, freq="D").tz_localize(tz).as_unit(unit)
+    )
+    df = pd.DataFrame({"ts_tz": tz_data})
+    tm.assert_frame_equal(df, from_dataframe(df.__dataframe__()))
diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py
@@ -2237,3 +2237,19 @@ def test_parse_dates_arrow_engine(all_parsers):
         }
     )
     tm.assert_frame_equal(result, expected)
+
+
+@xfail_pyarrow
+def test_from_csv_with_mixed_offsets(all_parsers):
+    parser = all_parsers
+    data = "a\n2020-01-01T00:00:00+01:00\n2020-01-01T00:00:00+00:00"
+    result = parser.read_csv(StringIO(data), parse_dates=["a"])["a"]
+    expected = Series(
+        [
+            Timestamp("2020-01-01 00:00:00+01:00"),
+            Timestamp("2020-01-01 00:00:00+00:00"),
+        ],
+        name="a",
+        index=[0, 1],
+    )
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/series/methods/test_argsort.py b/pandas/tests/series/methods/test_argsort.py
@@ -10,12 +10,20 @@
 
 
 class TestSeriesArgsort:
+    def test_argsort_axis(self):
+        # GH#54257
+        ser = Series(range(3))
+
+        msg = "No axis named 2 for object type Series"
+        with pytest.raises(ValueError, match=msg):
+            ser.argsort(axis=2)
+
     def test_argsort_numpy(self, datetime_series):
         ser = datetime_series
-        func = np.argsort
-        tm.assert_numpy_array_equal(
-            func(ser).values, func(np.array(ser)), check_dtype=False
-        )
+
+        res = np.argsort(ser).values
+        expected = np.argsort(np.array(ser))
+        tm.assert_numpy_array_equal(res, expected)
 
         # with missing values
         ts = ser.copy()
@@ -25,10 +33,10 @@ def test_argsort_numpy(self, datetime_series):
         with tm.assert_produces_warning(
             FutureWarning, match=msg, check_stacklevel=False
         ):
-            result = func(ts)[1::2]
-        expected = func(np.array(ts.dropna()))
+            result = np.argsort(ts)[1::2]
+        expected = np.argsort(np.array(ts.dropna()))
 
-        tm.assert_numpy_array_equal(result.values, expected, check_dtype=False)
+        tm.assert_numpy_array_equal(result.values, expected)
 
     def test_argsort(self, datetime_series):
         argsorted = datetime_series.argsort()
diff --git a/pyproject.toml b/pyproject.toml
@@ -3,7 +3,7 @@
 # See https://github.com/scipy/scipy/pull/12940 for the AIX issue.
 requires = [
     "meson-python==0.13.1",
-    "meson[ninja]==1.0.1",
+    "meson==1.0.1",
     "wheel",
     "Cython>=0.29.33,<3",  # Note: sync with setup.py, environment.yml and asv.conf.json
     "oldest-supported-numpy>=2022.8.16",