From 64f9690ceaa1684f32b1ef69c4df9e160fb549bd Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 9 Aug 2021 16:23:23 +0200 Subject: [PATCH 1/6] PERF: improve perf of Series fastpath constructor --- asv_bench/benchmarks/series_methods.py | 20 +++++++------ pandas/core/series.py | 40 ++++++++++++++------------ 2 files changed, 34 insertions(+), 26 deletions(-) diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 7592ce54e3712..9e4c78a48b906 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -3,6 +3,7 @@ import numpy as np from pandas import ( + Index, NaT, Series, date_range, @@ -12,20 +13,23 @@ class SeriesConstructor: - - params = [None, "dict"] - param_names = ["data"] - - def setup(self, data): + def setup(self): self.idx = date_range( start=datetime(2015, 10, 26), end=datetime(2016, 1, 1), freq="50s" ) - dict_data = dict(zip(self.idx, range(len(self.idx)))) - self.data = None if data is None else dict_data + self.data = dict(zip(self.idx, range(len(self.idx)))) + self.array = np.array([1, 2, 3]) + self.idx2 = Index(["a", "b", "c"]) - def time_constructor(self, data): + def time_constructor_dict(self): Series(data=self.data, index=self.idx) + def time_constructor_no_data(self): + Series(data=None, index=self.idx) + + def time_constructor_fastpath(self): + Series(self.array, index=self.idx, name="name", fastpath=True) + class NSort: diff --git a/pandas/core/series.py b/pandas/core/series.py index 6efd1f65c2264..29d9bb7db4197 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -441,7 +441,11 @@ def __init__( data = SingleArrayManager.from_array(data, index) generic.NDFrame.__init__(self, data) - self.name = name + if fastpath: + # skips validation of the name + object.__setattr__(self, "_name", name) + else: + self.name = name self._set_axis(0, index, fastpath=True) def _init_dict(self, data, index=None, dtype: Dtype | None = None): @@ -531,23 +535,23 @@ def _set_axis(self, axis: int, labels, fastpath: bool = False) -> None: if not fastpath: labels = ensure_index(labels) - if labels._is_all_dates: - deep_labels = labels - if isinstance(labels, CategoricalIndex): - deep_labels = labels.categories - - if not isinstance( - deep_labels, (DatetimeIndex, PeriodIndex, TimedeltaIndex) - ): - try: - labels = DatetimeIndex(labels) - # need to set here because we changed the index - if fastpath: - self._mgr.set_axis(axis, labels) - except (tslibs.OutOfBoundsDatetime, ValueError): - # labels may exceeds datetime bounds, - # or not be a DatetimeIndex - pass + if labels._is_all_dates: + deep_labels = labels + if isinstance(labels, CategoricalIndex): + deep_labels = labels.categories + + if not isinstance( + deep_labels, (DatetimeIndex, PeriodIndex, TimedeltaIndex) + ): + try: + labels = DatetimeIndex(labels) + # need to set here because we changed the index + if fastpath: + self._mgr.set_axis(axis, labels) + except (tslibs.OutOfBoundsDatetime, ValueError): + # labels may exceeds datetime bounds, + # or not be a DatetimeIndex + pass object.__setattr__(self, "_index", labels) if not fastpath: From 57659b8c7a0bbb5b7ae7ca4d140641066b3d111b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 9 Aug 2021 19:09:52 +0200 Subject: [PATCH 2/6] pass through the fastpath arg --- pandas/core/series.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 29d9bb7db4197..e31ba2c2d9d5f 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -446,7 +446,7 @@ def __init__( object.__setattr__(self, "_name", name) else: self.name = name - self._set_axis(0, index, fastpath=True) + self._set_axis(0, index, fastpath=fastpath) def _init_dict(self, data, index=None, dtype: Dtype | None = None): """ @@ -545,9 +545,6 @@ def _set_axis(self, axis: int, labels, fastpath: bool = False) -> None: ): try: labels = DatetimeIndex(labels) - # need to set here because we changed the index - if fastpath: - self._mgr.set_axis(axis, labels) except (tslibs.OutOfBoundsDatetime, ValueError): # labels may exceeds datetime bounds, # or not be a DatetimeIndex From 972185f675da791a29f4cd0f514842f2a1076175 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 11 Aug 2021 08:47:36 +0200 Subject: [PATCH 3/6] simplify --- pandas/core/series.py | 44 +++++++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 0a600adb472ed..0c28071780293 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -446,7 +446,7 @@ def __init__( object.__setattr__(self, "_name", name) else: self.name = name - self._set_axis(0, index, fastpath=fastpath) + self._set_axis(0, index) def _init_dict(self, data, index=None, dtype: Dtype | None = None): """ @@ -523,34 +523,32 @@ def _constructor_expanddim(self) -> type[DataFrame]: def _can_hold_na(self) -> bool: return self._mgr._can_hold_na - def _set_axis(self, axis: int, labels, fastpath: bool = False) -> None: + def _set_axis(self, axis: int, labels) -> None: """ Override generic, we want to set the _typ here. This is called from the cython code when we set the `index` attribute directly, e.g. `series.index = [1, 2, 3]`. """ - if not fastpath: - labels = ensure_index(labels) - - if labels._is_all_dates: - deep_labels = labels - if isinstance(labels, CategoricalIndex): - deep_labels = labels.categories - - if not isinstance( - deep_labels, (DatetimeIndex, PeriodIndex, TimedeltaIndex) - ): - try: - labels = DatetimeIndex(labels) - except (tslibs.OutOfBoundsDatetime, ValueError): - # labels may exceeds datetime bounds, - # or not be a DatetimeIndex - pass - - if not fastpath: - # The ensure_index call above ensures we have an Index object - self._mgr.set_axis(axis, labels) + labels = ensure_index(labels) + + if labels._is_all_dates: + deep_labels = labels + if isinstance(labels, CategoricalIndex): + deep_labels = labels.categories + + if not isinstance( + deep_labels, (DatetimeIndex, PeriodIndex, TimedeltaIndex) + ): + try: + labels = DatetimeIndex(labels) + except (tslibs.OutOfBoundsDatetime, ValueError): + # labels may exceeds datetime bounds, + # or not be a DatetimeIndex + pass + + # The ensure_index call above ensures we have an Index object + self._mgr.set_axis(axis, labels) # ndarray compatibility @property From 33fde4f9ce902af8cd61fe6bebc3e1d09db72fb9 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 29 Nov 2021 22:12:29 +0100 Subject: [PATCH 4/6] TEMP --- asv_bench/benchmarks/series_methods.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 203e23c42c988..d43314f0d461f 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -28,7 +28,7 @@ def time_constructor_no_data(self): Series(data=None, index=self.idx) def time_constructor_fastpath(self): - Series(self.array, index=self.idx, name="name", fastpath=True) + Series(self.array, index=self.idx2, name="name", fastpath=True) class ToFrame: From 22cc921f7d1f5f90f075aea6c59c60a69496ade7 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 6 Dec 2021 10:20:43 +0100 Subject: [PATCH 5/6] add test and whatsnew --- doc/source/whatsnew/v1.4.0.rst | 1 + pandas/tests/frame/indexing/test_indexing.py | 15 +++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index c73706878e856..f7b725cc2d70f 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -698,6 +698,7 @@ Indexing - Bug in indexing on columns with ``loc`` or ``iloc`` using a slice with a negative step with ``ExtensionDtype`` columns incorrectly raising (:issue:`44551`) - Bug in :meth:`IntervalIndex.get_indexer_non_unique` returning boolean mask instead of array of integers for a non unique and non monotonic index (:issue:`44084`) - Bug in :meth:`IntervalIndex.get_indexer_non_unique` not handling targets of ``dtype`` 'object' with NaNs correctly (:issue:`44482`) +- Bug in getting a column from a DataFrame with an object-dtype row index with datetime-like values: the resulting Series now preserves the exact object-dtype Index from the parent DataFrame (:issue:`42950`) - Missing diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 2194fb4d5b1bd..7846ec6045d1b 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1526,3 +1526,18 @@ def test_loc_iloc_setitem_non_categorical_rhs( # "c" not part of the categories with pytest.raises(TypeError, match=msg1): indexer(df)[key] = ["c", "c"] + + @pytest.mark.parametrize("indexer", [tm.getitem, tm.loc, tm.iloc]) + def test_getitem_preserve_object_index_with_dates(self, indexer): + # https://github.com/pandas-dev/pandas/pull/42950 - when selecting a column + # from dataframe, don't try to infer object dtype index on Series construction + idx = date_range("2012", periods=3).astype(object) + df = DataFrame({0: [1, 2, 3]}, index=idx) + assert df.index.dtype == object + + if indexer is tm.getitem: + ser = indexer(df)[0] + else: + ser = indexer(df)[:, 0] + + assert ser.index.dtype == object From daebcf1250777e12cfbc259b8e95f154e9355f68 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 22 Jan 2022 09:15:15 +0100 Subject: [PATCH 6/6] move whatsnew --- doc/source/whatsnew/v1.4.0.rst | 1 - doc/source/whatsnew/v1.5.0.rst | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 0c58057be1809..363d4b57544a9 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -900,7 +900,6 @@ Indexing - Bug in :meth:`DataFrame.loc.__setitem__` changing dtype when indexer was completely ``False`` (:issue:`37550`) - Bug in :meth:`IntervalIndex.get_indexer_non_unique` returning boolean mask instead of array of integers for a non unique and non monotonic index (:issue:`44084`) - Bug in :meth:`IntervalIndex.get_indexer_non_unique` not handling targets of ``dtype`` 'object' with NaNs correctly (:issue:`44482`) -- Bug in getting a column from a DataFrame with an object-dtype row index with datetime-like values: the resulting Series now preserves the exact object-dtype Index from the parent DataFrame (:issue:`42950`) - Fixed regression where a single column ``np.matrix`` was no longer coerced to a 1d ``np.ndarray`` when added to a :class:`DataFrame` (:issue:`42376`) - Bug in :meth:`Series.__getitem__` with a :class:`CategoricalIndex` of integers treating lists of integers as positional indexers, inconsistent with the behavior with a single scalar integer (:issue:`15470`, :issue:`14865`) - Bug in :meth:`Series.__setitem__` when setting floats or integers into integer-dtype :class:`Series` failing to upcast when necessary to retain precision (:issue:`45121`) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index fa19a49b7ff45..729d08f7d659b 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -216,6 +216,7 @@ Indexing - Bug in :meth:`Series.__setitem__` with a non-integer :class:`Index` when using an integer key to set a value that cannot be set inplace where a ``ValueError`` was raised insead of casting to a common dtype (:issue:`45070`) - Bug when setting a value too large for a :class:`Series` dtype failing to coerce to a common type (:issue:`26049`, :issue:`32878`) - Bug in :meth:`Series.__setitem__` where setting :attr:`NA` into a numeric-dtpye :class:`Series` would incorrectly upcast to object-dtype rather than treating the value as ``np.nan`` (:issue:`44199`) +- Bug in getting a column from a DataFrame with an object-dtype row index with datetime-like values: the resulting Series now preserves the exact object-dtype Index from the parent DataFrame (:issue:`42950`) - Missing