From 1c4b3697b9dfd58b45b0e5d28936ea316daf8750 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 15 Aug 2024 13:40:01 -0700 Subject: [PATCH 1/2] API (string): return str dtype for .dt methods, DatetimeIndex methods --- pandas/core/arrays/datetimelike.py | 5 ++++ pandas/core/arrays/datetimes.py | 16 +++++++++++++ pandas/core/indexes/datetimes.py | 2 +- pandas/core/indexes/extension.py | 4 ++-- pandas/tests/arrays/test_datetimelike.py | 24 ++++++++++++------- pandas/tests/io/excel/test_writers.py | 1 - .../series/accessors/test_dt_accessor.py | 8 +++---- 7 files changed, 43 insertions(+), 17 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index c5a08c1834a3e..6de3e5783c1c7 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -19,6 +19,7 @@ import numpy as np +from pandas._config import using_string_dtype from pandas._config.config import get_option from pandas._libs import ( @@ -1759,6 +1760,10 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: dtype='object') """ result = self._format_native_types(date_format=date_format, na_rep=np.nan) + if using_string_dtype(): + from pandas import StringDtype + + return pd_array(result, dtype=StringDtype(na_value=np.nan)) return result.astype(object, copy=False) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 4c275bc48fc6a..50f86f315b3af 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -15,6 +15,7 @@ import numpy as np +from pandas._config import using_string_dtype from pandas._config.config import get_option from pandas._libs import ( @@ -1332,6 +1333,13 @@ def month_name(self, locale=None) -> npt.NDArray[np.object_]: values, "month_name", locale=locale, reso=self._creso ) result = self._maybe_mask_results(result, fill_value=None) + if using_string_dtype(): + from pandas import ( + StringDtype, + array as pd_array, + ) + + return pd_array(result, dtype=StringDtype(na_value=np.nan)) return result def day_name(self, locale=None) -> npt.NDArray[np.object_]: @@ -1393,6 +1401,14 @@ def day_name(self, locale=None) -> npt.NDArray[np.object_]: values, "day_name", locale=locale, reso=self._creso ) result = self._maybe_mask_results(result, fill_value=None) + if using_string_dtype(): + # TODO: no tests that check for dtype of result as of 2024-08-15 + from pandas import ( + StringDtype, + array as pd_array, + ) + + return pd_array(result, dtype=StringDtype(na_value=np.nan)) return result @property diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 412ef8a4b1e51..3b3cda8f7cd33 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -263,7 +263,7 @@ def _engine_type(self) -> type[libindex.DatetimeEngine]: @doc(DatetimeArray.strftime) def strftime(self, date_format) -> Index: arr = self._data.strftime(date_format) - return Index(arr, name=self.name, dtype=object) + return Index(arr, name=self.name, dtype=arr.dtype) @doc(DatetimeArray.tz_convert) def tz_convert(self, tz) -> Self: diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py index 48d5e59250f35..2eeacfb769be4 100644 --- a/pandas/core/indexes/extension.py +++ b/pandas/core/indexes/extension.py @@ -74,7 +74,7 @@ def fget(self): return type(self)._simple_new(result, name=self.name) elif isinstance(result, ABCDataFrame): return result.set_index(self) - return Index(result, name=self.name) + return Index(result, name=self.name, dtype=result.dtype) return result def fset(self, value) -> None: @@ -101,7 +101,7 @@ def method(self, *args, **kwargs): # type: ignore[misc] return type(self)._simple_new(result, name=self.name) elif isinstance(result, ABCDataFrame): return result.set_index(self) - return Index(result, name=self.name) + return Index(result, name=self.name, dtype=result.dtype) return result # error: "property" has no attribute "__name__" diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 5834b268be2be..59ff4f3122e8f 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -891,20 +891,24 @@ def test_concat_same_type_different_freq(self, unit): tm.assert_datetime_array_equal(result, expected) - def test_strftime(self, arr1d): + def test_strftime(self, arr1d, using_infer_string): arr = arr1d result = arr.strftime("%Y %b") expected = np.array([ts.strftime("%Y %b") for ts in arr], dtype=object) - tm.assert_numpy_array_equal(result, expected) + if using_infer_string: + expected = pd.array(expected, dtype=pd.StringDtype(na_value=np.nan)) + tm.assert_equal(result, expected) - def test_strftime_nat(self): + def test_strftime_nat(self, using_infer_string): # GH 29578 arr = DatetimeIndex(["2019-01-01", NaT])._data result = arr.strftime("%Y-%m-%d") expected = np.array(["2019-01-01", np.nan], dtype=object) - tm.assert_numpy_array_equal(result, expected) + if using_infer_string: + expected = pd.array(expected, dtype=pd.StringDtype(na_value=np.nan)) + tm.assert_equal(result, expected) class TestTimedeltaArray(SharedTests): @@ -1161,20 +1165,24 @@ def test_array_interface(self, arr1d): expected = np.asarray(arr).astype("S20") tm.assert_numpy_array_equal(result, expected) - def test_strftime(self, arr1d): + def test_strftime(self, arr1d, using_infer_string): arr = arr1d result = arr.strftime("%Y") expected = np.array([per.strftime("%Y") for per in arr], dtype=object) - tm.assert_numpy_array_equal(result, expected) + if using_infer_string: + expected = pd.array(expected, dtype=pd.StringDtype(na_value=np.nan)) + tm.assert_equal(result, expected) - def test_strftime_nat(self): + def test_strftime_nat(self, using_infer_string): # GH 29578 arr = PeriodArray(PeriodIndex(["2019-01-01", NaT], dtype="period[D]")) result = arr.strftime("%Y-%m-%d") expected = np.array(["2019-01-01", np.nan], dtype=object) - tm.assert_numpy_array_equal(result, expected) + if using_infer_string: + expected = pd.array(expected, dtype=pd.StringDtype(na_value=np.nan)) + tm.assert_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index e1cdfb8bfa7e3..44266ae9a62a5 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -282,7 +282,6 @@ def test_excel_multindex_roundtrip( ) tm.assert_frame_equal(df, act, check_names=check_names) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_read_excel_parse_dates(self, tmp_excel): # see gh-11544, gh-12051 df = DataFrame( diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index f483eae3a9bd9..9b9a8ea3600ae 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -27,6 +27,7 @@ Period, PeriodIndex, Series, + StringDtype, TimedeltaIndex, date_range, period_range, @@ -513,7 +514,6 @@ def test_dt_accessor_datetime_name_accessors(self, time_locale): ser = pd.concat([ser, Series([pd.NaT])]) assert np.isnan(ser.dt.month_name(locale=time_locale).iloc[-1]) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_strftime(self): # GH 10086 ser = Series(date_range("20130101", periods=5)) @@ -584,10 +584,9 @@ def test_strftime_period_days(self, using_infer_string): dtype="=U10", ) if using_infer_string: - expected = expected.astype("str") + expected = expected.astype(StringDtype(na_value=np.nan)) tm.assert_index_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_strftime_dt64_microsecond_resolution(self): ser = Series([datetime(2013, 1, 1, 2, 32, 59), datetime(2013, 1, 2, 14, 32, 1)]) result = ser.dt.strftime("%Y-%m-%d %H:%M:%S") @@ -620,7 +619,6 @@ def test_strftime_period_minutes(self): ) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "data", [ @@ -643,7 +641,7 @@ def test_strftime_all_nat(self, data): ser = Series(data) with tm.assert_produces_warning(None): result = ser.dt.strftime("%Y-%m-%d") - expected = Series([np.nan], dtype=object) + expected = Series([np.nan], dtype="str") tm.assert_series_equal(result, expected) def test_valid_dt_with_missing_values(self): From 00ae358e9fe553f98caa3d38c3f8d786c8d5f635 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 15 Aug 2024 13:59:32 -0700 Subject: [PATCH 2/2] mypy fixup --- pandas/core/arrays/datetimelike.py | 2 +- pandas/core/arrays/datetimes.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 6de3e5783c1c7..fbe1677b95b33 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1763,7 +1763,7 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: if using_string_dtype(): from pandas import StringDtype - return pd_array(result, dtype=StringDtype(na_value=np.nan)) + return pd_array(result, dtype=StringDtype(na_value=np.nan)) # type: ignore[return-value] return result.astype(object, copy=False) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 50f86f315b3af..201c449185057 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -1339,7 +1339,7 @@ def month_name(self, locale=None) -> npt.NDArray[np.object_]: array as pd_array, ) - return pd_array(result, dtype=StringDtype(na_value=np.nan)) + return pd_array(result, dtype=StringDtype(na_value=np.nan)) # type: ignore[return-value] return result def day_name(self, locale=None) -> npt.NDArray[np.object_]: @@ -1408,7 +1408,7 @@ def day_name(self, locale=None) -> npt.NDArray[np.object_]: array as pd_array, ) - return pd_array(result, dtype=StringDtype(na_value=np.nan)) + return pd_array(result, dtype=StringDtype(na_value=np.nan)) # type: ignore[return-value] return result @property