From 39f1e8fd293d21212b52f2126e526ea636d30e07 Mon Sep 17 00:00:00 2001 From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com> Date: Fri, 13 Nov 2020 12:17:52 +0800 Subject: [PATCH 01/42] ENH: Add 'end' option in resample's origin --- pandas/tests/resample/test_resample_api.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 29f2aea1648ec..1bd3ecd6d2366 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -611,3 +611,17 @@ def test_resample_agg_readonly(): result = rs.agg("min") tm.assert_series_equal(result, expected) + + +def test_resample_end_origin(): + # GH#37804 + idx = pd.date_range('20200101 8:26:35', '20200101 9:31:58', freq='77s') + data = np.ones(len(idx)) + s = pd.Series(data, index=idx) + result = s.resample('7min', origin='end', closed='right').sum() + + exp_idx = pd.date_range('2020-01-01 08:20:45', '2020-01-01 09:23:45', freq='7T') + exp_data = [1., 6., 5., 6., 5., 6., 5., 6., 5., 6.] + expected = pd.Series(exp_data, index=exp_idx) + + tm.assert_series_equal(result, expected) From cd5aa643dc5f2aa05d17dfe0f8e08a694c0a9e4d Mon Sep 17 00:00:00 2001 From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com> Date: Fri, 13 Nov 2020 12:20:44 +0800 Subject: [PATCH 02/42] Update resample.py --- pandas/core/resample.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index fccedd75c4531..f6f1ae3524df7 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1404,14 +1404,16 @@ def __init__( self.fill_method = fill_method self.limit = limit - if origin in ("epoch", "start", "start_day"): + if origin in ("epoch", "start", "start_day", "end"): + if origin == "end" and self.closed == "left": + raise ValueError("'closed' has to be 'right' when 'origin' is 'end'.") self.origin = origin else: try: self.origin = Timestamp(origin) except Exception as e: raise ValueError( - "'origin' should be equal to 'epoch', 'start', 'start_day' or " + "'origin' should be equal to 'epoch', 'start', 'start_day' 'end' or " f"should be a Timestamp convertible type. Got '{origin}' instead." ) from e @@ -1846,6 +1848,10 @@ def _adjust_dates_anchored( origin_nanos = first.value elif isinstance(origin, Timestamp): origin_nanos = origin.value + elif origin == 'end': + sub_freq_times = (last.value - first.value) // freq.nanos + first = last - sub_freq_times * freq + origin_nanos = first.value origin_nanos += offset.value if offset else 0 # GH 10117 & GH 19375. If first and last contain timezone information, From 0184b1deeacfc2cfd641dcc9adb5a8469200f4d5 Mon Sep 17 00:00:00 2001 From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com> Date: Fri, 13 Nov 2020 12:24:17 +0800 Subject: [PATCH 03/42] Update resample.py --- pandas/core/resample.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index f6f1ae3524df7..293fcc086952c 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1413,8 +1413,9 @@ def __init__( self.origin = Timestamp(origin) except Exception as e: raise ValueError( - "'origin' should be equal to 'epoch', 'start', 'start_day' 'end' or " - f"should be a Timestamp convertible type. Got '{origin}' instead." + "'origin' should be equal to 'epoch', 'start', 'start_day', 'end' " + f"or should be a Timestamp convertible type. Got '{origin}' " + "instead." 
) from e try: From ff35b6f33daf9aa19c517e44a49bf462c0a6855e Mon Sep 17 00:00:00 2001 From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com> Date: Fri, 13 Nov 2020 12:48:05 +0800 Subject: [PATCH 04/42] Update test_resample_api.py --- pandas/tests/resample/test_resample_api.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 1bd3ecd6d2366..623d5d496f939 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -615,13 +615,13 @@ def test_resample_agg_readonly(): def test_resample_end_origin(): # GH#37804 - idx = pd.date_range('20200101 8:26:35', '20200101 9:31:58', freq='77s') + idx = date_range('20200101 8:26:35', '20200101 9:31:58', freq='77s') data = np.ones(len(idx)) - s = pd.Series(data, index=idx) + s = Series(data, index=idx) result = s.resample('7min', origin='end', closed='right').sum() - exp_idx = pd.date_range('2020-01-01 08:20:45', '2020-01-01 09:23:45', freq='7T') + exp_idx = date_range('2020-01-01 08:20:45', '2020-01-01 09:23:45', freq='7T') exp_data = [1., 6., 5., 6., 5., 6., 5., 6., 5., 6.] - expected = pd.Series(exp_data, index=exp_idx) + expected = Series(exp_data, index=exp_idx) tm.assert_series_equal(result, expected) From 8c4549edce77048bfb2c0eb1f39b39a02f30a35b Mon Sep 17 00:00:00 2001 From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com> Date: Fri, 13 Nov 2020 12:58:58 +0800 Subject: [PATCH 05/42] Update resample.py --- pandas/core/resample.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 293fcc086952c..2b42fefa9247c 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1849,7 +1849,7 @@ def _adjust_dates_anchored( origin_nanos = first.value elif isinstance(origin, Timestamp): origin_nanos = origin.value - elif origin == 'end': + elif origin == "end": sub_freq_times = (last.value - first.value) // freq.nanos first = last - sub_freq_times * freq origin_nanos = first.value From b835d1ae79e2e8090e5d25da4db58fd4dea42a7c Mon Sep 17 00:00:00 2001 From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com> Date: Fri, 13 Nov 2020 13:00:27 +0800 Subject: [PATCH 06/42] Update test_resample_api.py --- pandas/tests/resample/test_resample_api.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 623d5d496f939..a7c4cc1696808 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -615,13 +615,13 @@ def test_resample_agg_readonly(): def test_resample_end_origin(): # GH#37804 - idx = date_range('20200101 8:26:35', '20200101 9:31:58', freq='77s') + idx = date_range("20200101 8:26:35", "20200101 9:31:58", freq="77s") data = np.ones(len(idx)) s = Series(data, index=idx) - result = s.resample('7min', origin='end', closed='right').sum() + result = s.resample("7min", origin="end", closed="right").sum() - exp_idx = date_range('2020-01-01 08:20:45', '2020-01-01 09:23:45', freq='7T') - exp_data = [1., 6., 5., 6., 5., 6., 5., 6., 5., 6.] 
+ exp_idx = date_range("2020-01-01 08:20:45", "2020-01-01 09:23:45", freq="7T") + exp_data = [1, 6, 5, 6, 5, 6, 5, 6, 5, 6] expected = Series(exp_data, index=exp_idx) tm.assert_series_equal(result, expected) From bf15c67842c3b5c479ddee4f002245a77a600609 Mon Sep 17 00:00:00 2001 From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com> Date: Fri, 13 Nov 2020 16:02:28 +0800 Subject: [PATCH 07/42] Update test_resample_api.py --- pandas/tests/resample/test_resample_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index a7c4cc1696808..aeaf4d099eacb 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -621,7 +621,7 @@ def test_resample_end_origin(): result = s.resample("7min", origin="end", closed="right").sum() exp_idx = date_range("2020-01-01 08:20:45", "2020-01-01 09:23:45", freq="7T") - exp_data = [1, 6, 5, 6, 5, 6, 5, 6, 5, 6] + exp_data = [1.0, 6.0, 5.0, 6.0, 5.0, 6.0, 5.0, 6.0, 5.0, 6.0] expected = Series(exp_data, index=exp_idx) tm.assert_series_equal(result, expected) From e4b01d87f4ad8897b3d4629f0dc4b8f28b013b79 Mon Sep 17 00:00:00 2001 From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com> Date: Fri, 13 Nov 2020 16:03:58 +0800 Subject: [PATCH 08/42] Update test_datetime_index.py --- pandas/tests/resample/test_datetime_index.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index d3d33d6fe847e..c94a687addb47 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -770,8 +770,8 @@ def test_resample_bad_origin(origin): rng = date_range("2000-01-01 00:00:00", "2000-01-01 02:00", freq="s") ts = Series(np.random.randn(len(rng)), index=rng) msg = ( - "'origin' should be equal to 'epoch', 'start', 'start_day' or " - f"should be a Timestamp convertible type. Got '{origin}' instead." + "'origin' should be equal to 'epoch', 'start', 'start_day', 'end' " + f"or should be a Timestamp convertible type. Got '{origin}' instead." 
) with pytest.raises(ValueError, match=msg): ts.resample("5min", origin=origin) From d096ccd1e9fe1137d20344bda6a3bd67197998b2 Mon Sep 17 00:00:00 2001 From: GYHHAHA <1801214626@qq.com> Date: Fri, 27 Nov 2020 16:42:14 +0800 Subject: [PATCH 09/42] add backward para and end_day option --- pandas/core/generic.py | 2 + pandas/core/resample.py | 62 ++++++++++++++------ pandas/tests/resample/test_datetime_index.py | 2 +- pandas/tests/resample/test_resample_api.py | 2 +- 4 files changed, 49 insertions(+), 19 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 24c1ae971686e..a24d0c780838b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7952,6 +7952,7 @@ def resample( on=None, level=None, origin: Union[str, TimestampConvertibleTypes] = "start_day", + backward: Optional[bool] = None, offset: Optional[TimedeltaConvertibleTypes] = None, ) -> Resampler: """ @@ -8337,6 +8338,7 @@ def resample( key=on, level=level, origin=origin, + backward=backward, offset=offset, ) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 2b42fefa9247c..5b74782315d99 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1366,6 +1366,7 @@ def __init__( convention: Optional[str] = None, base: Optional[int] = None, origin: Union[str, TimestampConvertibleTypes] = "start_day", + backward: Optional[bool] = None, offset: Optional[TimedeltaConvertibleTypes] = None, **kwargs, ): @@ -1389,9 +1390,15 @@ def __init__( label = "right" else: if closed is None: - closed = "left" + if origin in ["end", "end_day"] or backward: + closed = "right" + else: + closed = "left" if label is None: - label = "left" + if origin in ["end", "end_day"] or backward: + label = "right" + else: + label = "left" self.closed = closed self.label = label @@ -1404,20 +1411,32 @@ def __init__( self.fill_method = fill_method self.limit = limit - if origin in ("epoch", "start", "start_day", "end"): - if origin == "end" and self.closed == "left": - raise ValueError("'closed' has to be 'right' when 'origin' is 'end'.") + if origin in ("epoch", "start", "start_day", "end", "end_day"): self.origin = origin else: try: self.origin = Timestamp(origin) except Exception as e: raise ValueError( - "'origin' should be equal to 'epoch', 'start', 'start_day', 'end' " + "'origin' should be equal to 'epoch', 'start', 'start_day', 'end', 'end_day' " f"or should be a Timestamp convertible type. Got '{origin}' " "instead." 
) from e + if backward is None: + if self.origin in ("end", "end_day"): + self.backward = True + else: + self.backward = False + elif backward: + if origin in ("start", "start_day"): + raise ValueError(f"`start` or `start_day` origin isn't allowed when `backward` is True") + self.backward = backward + else: + if origin in ("end", "end_day"): + raise ValueError(f"`end` or `end_day` origin isn't allowed when `backward` is False") + self.backward = backward + try: self.offset = Timedelta(offset) if offset is not None else None except Exception as e: @@ -1505,6 +1524,7 @@ def _get_time_bins(self, ax): self.freq, closed=self.closed, origin=self.origin, + backward=self.backward, offset=self.offset, ) # GH #12037 @@ -1658,6 +1678,7 @@ def _get_period_bins(self, ax: PeriodIndex): self.freq, closed=self.closed, origin=self.origin, + backward=self.backward, offset=self.offset, ) @@ -1711,7 +1732,7 @@ def _take_new_index(obj, indexer, new_index, axis=0): def _get_timestamp_range_edges( - first, last, freq, closed="left", origin="start_day", offset=None + first, last, freq, closed="left", origin="start_day", backward=False, offset=None ): """ Adjust the `first` Timestamp to the preceding Timestamp that resides on @@ -1764,7 +1785,7 @@ def _get_timestamp_range_edges( origin = origin.tz_localize(None) first, last = _adjust_dates_anchored( - first, last, freq, closed=closed, origin=origin, offset=offset + first, last, freq, closed=closed, origin=origin, backward=backward, offset=offset ) if isinstance(freq, Day): first = first.tz_localize(index_tz) @@ -1784,7 +1805,7 @@ def _get_timestamp_range_edges( def _get_period_range_edges( - first, last, freq, closed="left", origin="start_day", offset=None + first, last, freq, closed="left", origin="start_day", backward=False, offset=None ): """ Adjust the provided `first` and `last` Periods to the respective Period of @@ -1826,7 +1847,7 @@ def _get_period_range_edges( adjust_last = freq.is_on_offset(last) first, last = _get_timestamp_range_edges( - first, last, freq, closed=closed, origin=origin, offset=offset + first, last, freq, closed=closed, origin=origin, backward=backward, offset=offset ) first = (first + int(adjust_first) * freq).to_period(freq) @@ -1835,7 +1856,7 @@ def _get_period_range_edges( def _adjust_dates_anchored( - first, last, freq, closed="right", origin="start_day", offset=None + first, last, freq, closed="right", origin="start_day", backward=False, offset=None ): # First and last offsets should be calculated from the start day to fix an # error cause by resampling across multiple days when a one day period is @@ -1847,12 +1868,19 @@ def _adjust_dates_anchored( origin_nanos = first.normalize().value elif origin == "start": origin_nanos = first.value - elif isinstance(origin, Timestamp): - origin_nanos = origin.value - elif origin == "end": - sub_freq_times = (last.value - first.value) // freq.nanos - first = last - sub_freq_times * freq - origin_nanos = first.value + elif isinstance(origin, Timestamp) or origin in ("end", "end_day"): + if backward: + if origin == "end": + origin = last + elif origin == "end_day": + origin = last.ceil('D') + sub_freq_times = (origin.value - first.value) // freq.nanos + if closed == "left": + sub_freq_times += 1 + first = origin - sub_freq_times * freq + origin_nanos = first.value + else: + origin_nanos = origin.value origin_nanos += offset.value if offset else 0 # GH 10117 & GH 19375. 
If first and last contain timezone information, diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index c94a687addb47..156b24e0baf30 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -770,7 +770,7 @@ def test_resample_bad_origin(origin): rng = date_range("2000-01-01 00:00:00", "2000-01-01 02:00", freq="s") ts = Series(np.random.randn(len(rng)), index=rng) msg = ( - "'origin' should be equal to 'epoch', 'start', 'start_day', 'end' " + "'origin' should be equal to 'epoch', 'start', 'start_day', 'end', 'end_day' " f"or should be a Timestamp convertible type. Got '{origin}' instead." ) with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index aeaf4d099eacb..54f4c8189037e 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -620,7 +620,7 @@ def test_resample_end_origin(): s = Series(data, index=idx) result = s.resample("7min", origin="end", closed="right").sum() - exp_idx = date_range("2020-01-01 08:20:45", "2020-01-01 09:23:45", freq="7T") + exp_idx = date_range("2020-01-01 08:27:45", "2020-01-01 09:30:45", freq="7T") exp_data = [1.0, 6.0, 5.0, 6.0, 5.0, 6.0, 5.0, 6.0, 5.0, 6.0] expected = Series(exp_data, index=exp_idx) From 222ef8dd9ece689909ce0b5c3dfda426ff9f0959 Mon Sep 17 00:00:00 2001 From: GYHHAHA <1801214626@qq.com> Date: Fri, 27 Nov 2020 17:42:29 +0800 Subject: [PATCH 10/42] add doc-string --- pandas/core/generic.py | 44 +++++++++++++++++++++++++++++++--- pandas/core/groupby/grouper.py | 30 ++++++++++++++++++++--- pandas/core/resample.py | 26 ++++++++++++++++---- 3 files changed, 89 insertions(+), 11 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index a24d0c780838b..1f2d2d3228774 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7973,8 +7973,9 @@ def resample( `DatetimeIndex`, `TimedeltaIndex` or `PeriodIndex`. closed : {'right', 'left'}, default None Which side of bin interval is closed. The default is 'left' - for all frequency offsets except for 'M', 'A', 'Q', 'BM', - 'BA', 'BQ', and 'W' which all have a default of 'right'. + for all frequency offsets with forward resampling except for 'M', + 'A', 'Q', 'BM', 'BA', 'BQ', and 'W' which all have a default of + 'right'. When `Backward` set to be True, default is 'right'. label : {'right', 'left'}, default None Which bin edge label to label bucket with. The default is 'left' for all frequency offsets except for 'M', 'A', 'Q', 'BM', @@ -8007,7 +8008,7 @@ def resample( level : str or int, optional For a MultiIndex, level (name or number) to use for resampling. `level` must be datetime-like. - origin : {'epoch', 'start', 'start_day'}, Timestamp or str, default 'start_day' + origin : {'epoch', 'start', 'start_day', 'end', 'end_day'}, Timestamp or str, default 'start_day' The timestamp on which to adjust the grouping. The timezone of origin must match the timezone of the index. If a timestamp is not used, these values are also supported: @@ -8018,6 +8019,21 @@ def resample( .. versionadded:: 1.1.0 + - 'end': `origin` is the last value of the timeseries + - 'end_day': `origin` is the ceiling midnight of the last day + + .. versionadded:: 1.2.0 + + backward : bool, default is None + Resample on the given `origin` from a backward direction. True when + `origin` is 'end' or 'end_day'. False when `origin` is 'start' or + 'start_day'. 
Optional when using a datetime `origin`, and defaults to
+ False. Each resampled label then stands for the group running from
+ the label minus the given `freq` up to the label itself, with a right
+ `closed` setting by default.
+
+ .. versionadded:: 1.2.0
+
 offset : Timedelta or str, default is None
 An offset timedelta added to the origin.
@@ -8297,6 +8313,28 @@ def resample(
 2000-10-02 00:21:00 24
 Freq: 17T, dtype: int64
+ If you want to take the last timestamp as `origin` with a backward resample:
+
+ >>> ts.index.max()
+ Timestamp('2000-10-02 00:26:00', freq='7T')
+ >>> ts.groupby(pd.Grouper(freq='17min', origin='end')).sum()
+ 2000-10-01 23:35:00 0
+ 2000-10-01 23:52:00 18
+ 2000-10-02 00:09:00 27
+ 2000-10-02 00:26:00 63
+ Freq: 17T, dtype: int32
+
+ You can also specify the backward origin:
+
+ >>> ts.groupby(pd.Grouper(freq='17min',
+ origin='2000-10-02 00:30:00',
+ backward=True)).sum()
+ 2000-10-01 23:39:00 3
+ 2000-10-01 23:56:00 15
+ 2000-10-02 00:13:00 45
+ 2000-10-02 00:30:00 45
+ Freq: 17T, dtype: int32
+
 To replace the use of the deprecated `base` argument, you can now use `offset`, in this example it is equivalent to have `base=2`:
diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
index e8af9da30a298..7889c50ca3df7 100644
--- a/pandas/core/groupby/grouper.py
+++ b/pandas/core/groupby/grouper.py
@@ -83,9 +83,9 @@ class Grouper:
 However, loffset is also deprecated for ``.resample(...)``
 See: :class:`DataFrame.resample`
- origin : {'epoch', 'start', 'start_day'}, Timestamp or str, default 'start_day'
- The timestamp on which to adjust the grouping. The timezone of origin must
- match the timezone of the index.
+ origin : {'epoch', 'start', 'start_day', 'end', 'end_day'}, Timestamp or str, default 'start_day'
+ The timestamp on which to adjust the grouping. The timezone of origin
+ must match the timezone of the index.
 If a timestamp is not used, these values are also supported:
 - 'epoch': `origin` is 1970-01-01
@@ -94,6 +94,21 @@ class Grouper:
 .. versionadded:: 1.1.0
+ - 'end': `origin` is the last value of the timeseries
+ - 'end_day': `origin` is the ceiling midnight of the last day
+
+ .. versionadded:: 1.2.0
+
+ backward : bool, default is None
+ Resample on the given `origin` from a backward direction. True when
+ `origin` is 'end' or 'end_day'. False when `origin` is 'start' or
+ 'start_day'. Optional when using a datetime `origin`, and defaults to
+ False. Each resampled label then stands for the group running from
+ the label minus the given `freq` up to the label itself, with a right
+ `closed` setting by default.
+
+ .. versionadded:: 1.2.0
+
 offset : Timedelta or str, default is None
 An offset timedelta added to the origin.
@@ -200,6 +215,15 @@ class Grouper: 2000-10-02 00:15:00 45 Freq: 17T, dtype: int64 + If you want to take the last timestamp as `origin` with a backward resample: + + >>> ts.groupby(pd.Grouper(freq='17min', origin='end')).sum() + 2000-10-01 23:39:00 0 + 2000-10-01 23:56:00 0 + 2000-10-02 00:13:00 3 + 2000-10-02 00:30:00 6 + Freq: 17T, dtype: int32 + If you want to adjust the start of the bins with an `offset` Timedelta, the two following lines are equivalent: diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 5b74782315d99..3d7389b724819 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1430,11 +1430,15 @@ def __init__( self.backward = False elif backward: if origin in ("start", "start_day"): - raise ValueError(f"`start` or `start_day` origin isn't allowed when `backward` is True") + raise ValueError( + f"`start` or `start_day` origin isn't allowed when `backward` is True" + ) self.backward = backward else: if origin in ("end", "end_day"): - raise ValueError(f"`end` or `end_day` origin isn't allowed when `backward` is False") + raise ValueError( + f"`end` or `end_day` origin isn't allowed when `backward` is False" + ) self.backward = backward try: @@ -1785,7 +1789,13 @@ def _get_timestamp_range_edges( origin = origin.tz_localize(None) first, last = _adjust_dates_anchored( - first, last, freq, closed=closed, origin=origin, backward=backward, offset=offset + first, + last, + freq, + closed=closed, + origin=origin, + backward=backward, + offset=offset, ) if isinstance(freq, Day): first = first.tz_localize(index_tz) @@ -1847,7 +1857,13 @@ def _get_period_range_edges( adjust_last = freq.is_on_offset(last) first, last = _get_timestamp_range_edges( - first, last, freq, closed=closed, origin=origin, backward=backward, offset=offset + first, + last, + freq, + closed=closed, + origin=origin, + backward=backward, + offset=offset, ) first = (first + int(adjust_first) * freq).to_period(freq) @@ -1873,7 +1889,7 @@ def _adjust_dates_anchored( if origin == "end": origin = last elif origin == "end_day": - origin = last.ceil('D') + origin = last.ceil("D") sub_freq_times = (origin.value - first.value) // freq.nanos if closed == "left": sub_freq_times += 1 From 90c9c5f073d71341c93ba6994f390b53831aa1bd Mon Sep 17 00:00:00 2001 From: GYHHAHA <1801214626@qq.com> Date: Fri, 27 Nov 2020 18:40:44 +0800 Subject: [PATCH 11/42] add test cases --- pandas/core/resample.py | 2 +- pandas/tests/resample/test_resample_api.py | 115 ++++++++++++++++++++- 2 files changed, 115 insertions(+), 2 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 3d7389b724819..cb0d91e0e5398 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1438,7 +1438,7 @@ def __init__( if origin in ("end", "end_day"): raise ValueError( f"`end` or `end_day` origin isn't allowed when `backward` is False" - ) + ) self.backward = backward try: diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 54f4c8189037e..63b678b04a654 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -613,8 +613,121 @@ def test_resample_agg_readonly(): tm.assert_series_equal(result, expected) -def test_resample_end_origin(): +def test_backward_resample(): # GH#37804 + + start, end = "2000-10-01 23:30:00", "2000-10-02 00:26:00" + rng = date_range(start, end, freq="7min") + ts = Series(np.arange(len(rng)) * 3, index=rng) + + # test consistency of backward and origin + msg = "`start` or 
`start_day` origin isn't allowed when `backward` is True" + with pytest.raises(ValueError, match=msg): + ts.resample("1min", origin="start", backward=True) + msg = "`end` or `end_day` origin isn't allowed when `backward` is False" + with pytest.raises(ValueError, match=msg): + ts.resample("1min", origin="end", backward=False) + + # test end origin + res = ts.resample("17min", origin="end").sum().astype("int64") + data = [0, 18, 27, 63] + expected = Series( + data, + index=date_range( + end="20001002 00:26:00", + freq="17min", + periods=4, + ) + ) + + tm.assert_series_equal(res, expected) + + # test end_day origin + # 12 == 24 * 60 - 84 * 17 <= 26 (last value) <= 24 * 60 - 83 * 17 == 29 + res = ts.resample("17min", origin="end_day").sum().astype("int64") + data = [3, 15, 45, 45] + expected = Series( + data, + index=date_range( + end="2000-10-02 00:29:00", + freq="17min", + periods=4, + ) + ) + + tm.assert_series_equal(res, expected) + + # test datetime origin with backward resample + res = ts.resample( + "17min", + origin="2000-10-02 00:40:00", + backward=True, + ).sum().astype("int64") + data = [0, 9, 36, 39, 24] + expected = Series( + data, + index=date_range( + end="2000-10-02 00:40:00", + freq="17min", + periods=5, + ) + ) + + tm.assert_series_equal(res, expected) + + res = ts.resample( + "17min", + origin="2000-10-02 01:05:00", + backward=True, + ).sum().astype("int64") + data = [3, 15, 45, 45] + expected = Series( + data, + index=date_range( + end="2000-10-02 00:31:00", + freq="17min", + periods=4, + ) + ) + + tm.assert_series_equal(res, expected) + + # test right and left close + res = ts.resample( + "17min", + origin="end", + closed="right", + ).sum().astype("int64") + data = [0, 18, 27, 63] + expected = Series( + data, + index=date_range( + end="2000-10-02 00:26:00 ", + freq="17min", + periods=4, + ) + ) + + tm.assert_series_equal(res, expected) + + res = ts.resample( + "17min", + origin="end", + closed="left", + ).sum().astype("int64") + data = [0, 18, 27, 39, 24] + expected = Series( + data, + index=date_range( + end="2000-10-02 00:43:00", + freq="17min", + periods=5, + ) + ) + + tm.assert_series_equal(res, expected) + + # original test case idx = date_range("20200101 8:26:35", "20200101 9:31:58", freq="77s") data = np.ones(len(idx)) s = Series(data, index=idx) From eae898ccf44068bc163769b54f31608f4351a91b Mon Sep 17 00:00:00 2001 From: GYHHAHA <1801214626@qq.com> Date: Fri, 27 Nov 2020 18:57:22 +0800 Subject: [PATCH 12/42] fix format --- pandas/core/generic.py | 3 +- pandas/core/groupby/grouper.py | 3 +- pandas/core/resample.py | 8 ++++-- pandas/tests/resample/test_resample_api.py | 32 +++++++++++----------- 4 files changed, 25 insertions(+), 21 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 1f2d2d3228774..98fef283eb8d9 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8008,7 +8008,8 @@ def resample( level : str or int, optional For a MultiIndex, level (name or number) to use for resampling. `level` must be datetime-like. - origin : {'epoch', 'start', 'start_day', 'end', 'end_day'}, Timestamp or str, default 'start_day' + origin : {'epoch', 'start', 'start_day', 'end', 'end_day'}, Timestamp + or str, default 'start_day' The timestamp on which to adjust the grouping. The timezone of origin must match the timezone of the index. 
If a timestamp is not used, these values are also supported: diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 7889c50ca3df7..b040fe993100b 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -83,7 +83,8 @@ class Grouper: However, loffset is also deprecated for ``.resample(...)`` See: :class:`DataFrame.resample` - origin : {'epoch', 'start', 'start_day', 'end', 'end_day'}, Timestamp or str, default 'start_day' + origin : {'epoch', 'start', 'start_day', 'end', 'end_day'}, Timestamp + or str, default 'start_day' The timestamp on which to adjust the grouping. The timezone of origin must match the timezone of the index. If a timestamp is not used, these values are also supported: diff --git a/pandas/core/resample.py b/pandas/core/resample.py index cb0d91e0e5398..58cafb5d151de 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1418,8 +1418,9 @@ def __init__( self.origin = Timestamp(origin) except Exception as e: raise ValueError( - "'origin' should be equal to 'epoch', 'start', 'start_day', 'end', 'end_day' " - f"or should be a Timestamp convertible type. Got '{origin}' " + "'origin' should be equal to 'epoch', 'start', 'start_day'," + " 'end', 'end_day' or should be a Timestamp convertible" + f" type. Got '{origin}' " "instead." ) from e @@ -1431,7 +1432,8 @@ def __init__( elif backward: if origin in ("start", "start_day"): raise ValueError( - f"`start` or `start_day` origin isn't allowed when `backward` is True" + f"`start` or `start_day` origin isn't allowed when " + "`backward` is True" ) self.backward = backward else: diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 63b678b04a654..0cf081c3adb96 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -659,10 +659,10 @@ def test_backward_resample(): # test datetime origin with backward resample res = ts.resample( - "17min", - origin="2000-10-02 00:40:00", - backward=True, - ).sum().astype("int64") + "17min", + origin="2000-10-02 00:40:00", + backward=True, + ).sum().astype("int64") data = [0, 9, 36, 39, 24] expected = Series( data, @@ -676,10 +676,10 @@ def test_backward_resample(): tm.assert_series_equal(res, expected) res = ts.resample( - "17min", - origin="2000-10-02 01:05:00", - backward=True, - ).sum().astype("int64") + "17min", + origin="2000-10-02 01:05:00", + backward=True, + ).sum().astype("int64") data = [3, 15, 45, 45] expected = Series( data, @@ -694,10 +694,10 @@ def test_backward_resample(): # test right and left close res = ts.resample( - "17min", - origin="end", - closed="right", - ).sum().astype("int64") + "17min", + origin="end", + closed="right", + ).sum().astype("int64") data = [0, 18, 27, 63] expected = Series( data, @@ -711,10 +711,10 @@ def test_backward_resample(): tm.assert_series_equal(res, expected) res = ts.resample( - "17min", - origin="end", - closed="left", - ).sum().astype("int64") + "17min", + origin="end", + closed="left", + ).sum().astype("int64") data = [0, 18, 27, 39, 24] expected = Series( data, From 2ee1000d7144481354b60bb1b70470424b2936f0 Mon Sep 17 00:00:00 2001 From: GYHHAHA <1801214626@qq.com> Date: Fri, 27 Nov 2020 18:59:13 +0800 Subject: [PATCH 13/42] Update test_resample_api.py --- pandas/tests/resample/test_resample_api.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 
0cf081c3adb96..6d2bfaf0ac181 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -637,7 +637,7 @@ def test_backward_resample(): end="20001002 00:26:00", freq="17min", periods=4, - ) + ), ) tm.assert_series_equal(res, expected) @@ -652,7 +652,7 @@ def test_backward_resample(): end="2000-10-02 00:29:00", freq="17min", periods=4, - ) + ), ) tm.assert_series_equal(res, expected) @@ -670,7 +670,7 @@ def test_backward_resample(): end="2000-10-02 00:40:00", freq="17min", periods=5, - ) + ), ) tm.assert_series_equal(res, expected) @@ -687,7 +687,7 @@ def test_backward_resample(): end="2000-10-02 00:31:00", freq="17min", periods=4, - ) + ), ) tm.assert_series_equal(res, expected) @@ -705,7 +705,7 @@ def test_backward_resample(): end="2000-10-02 00:26:00 ", freq="17min", periods=4, - ) + ), ) tm.assert_series_equal(res, expected) @@ -722,7 +722,7 @@ def test_backward_resample(): end="2000-10-02 00:43:00", freq="17min", periods=5, - ) + ), ) tm.assert_series_equal(res, expected) From 3442e007af697442d333c29b1c9bc5b4a54fd957 Mon Sep 17 00:00:00 2001 From: GYHHAHA <1801214626@qq.com> Date: Fri, 27 Nov 2020 19:08:46 +0800 Subject: [PATCH 14/42] Update test_resample_api.py --- pandas/tests/resample/test_resample_api.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 6d2bfaf0ac181..17229684d015a 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -710,11 +710,15 @@ def test_backward_resample(): tm.assert_series_equal(res, expected) - res = ts.resample( - "17min", - origin="end", - closed="left", - ).sum().astype("int64") + res = ( + ts.resample( + "17min", + origin="end", + closed="left", + ) + .sum() + .astype("int64") + ) data = [0, 18, 27, 39, 24] expected = Series( data, From a33acacf24afd1d8c3e02a3a45d55f5a63b61e92 Mon Sep 17 00:00:00 2001 From: GYHHAHA <1801214626@qq.com> Date: Fri, 27 Nov 2020 19:17:00 +0800 Subject: [PATCH 15/42] Update test_resample_api.py --- pandas/tests/resample/test_resample_api.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 17229684d015a..f6e8c56b5f332 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -693,11 +693,15 @@ def test_backward_resample(): tm.assert_series_equal(res, expected) # test right and left close - res = ts.resample( - "17min", - origin="end", - closed="right", - ).sum().astype("int64") + res = ( + ts.resample( + "17min", + origin="end", + closed="right", + ) + .sum() + .astype("int64") + ) data = [0, 18, 27, 63] expected = Series( data, From 7c5483994be5543c9bff2615dfeb252851dd3548 Mon Sep 17 00:00:00 2001 From: GYHHAHA <1801214626@qq.com> Date: Fri, 27 Nov 2020 19:33:22 +0800 Subject: [PATCH 16/42] Update test_resample_api.py --- pandas/tests/resample/test_resample_api.py | 28 ++++++++++++++-------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index f6e8c56b5f332..6b000cae7c1dc 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -658,11 +658,15 @@ def test_backward_resample(): tm.assert_series_equal(res, expected) # test datetime origin with backward resample - res = 
ts.resample( - "17min", - origin="2000-10-02 00:40:00", - backward=True, - ).sum().astype("int64") + res = ( + ts.resample( + "17min", + origin="2000-10-02 00:40:00", + backward=True, + ) + .sum() + .astype("int64") + ) data = [0, 9, 36, 39, 24] expected = Series( data, @@ -675,11 +679,15 @@ def test_backward_resample(): tm.assert_series_equal(res, expected) - res = ts.resample( - "17min", - origin="2000-10-02 01:05:00", - backward=True, - ).sum().astype("int64") + res = ( + ts.resample( + "17min", + origin="2000-10-02 01:05:00", + backward=True, + ) + .sum() + .astype("int64") + ) data = [3, 15, 45, 45] expected = Series( data, From a4e0a3919e42d14fb3c9961f4dc86d9e02e8f664 Mon Sep 17 00:00:00 2001 From: GYHHAHA <1801214626@qq.com> Date: Fri, 27 Nov 2020 21:39:11 +0800 Subject: [PATCH 17/42] flake8 fix --- pandas/core/resample.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 58cafb5d151de..53e565a966769 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1418,10 +1418,9 @@ def __init__( self.origin = Timestamp(origin) except Exception as e: raise ValueError( - "'origin' should be equal to 'epoch', 'start', 'start_day'," - " 'end', 'end_day' or should be a Timestamp convertible" - f" type. Got '{origin}' " - "instead." + "'origin' should be equal to 'epoch', 'start', 'start_day', " + "'end', 'end_day' or should be a Timestamp convertible " + f"type. Got '{origin}' instead." ) from e if backward is None: @@ -1432,14 +1431,14 @@ def __init__( elif backward: if origin in ("start", "start_day"): raise ValueError( - f"`start` or `start_day` origin isn't allowed when " + "`start` or `start_day` origin isn't allowed when " "`backward` is True" ) self.backward = backward else: if origin in ("end", "end_day"): raise ValueError( - f"`end` or `end_day` origin isn't allowed when `backward` is False" + "`end` or `end_day` origin isn't allowed when `backward` is False" ) self.backward = backward From 0e2e390a020f872d359689dba67ab91f5f4cb660 Mon Sep 17 00:00:00 2001 From: GYHHAHA <1801214626@qq.com> Date: Fri, 27 Nov 2020 22:50:08 +0800 Subject: [PATCH 18/42] break lines --- pandas/core/generic.py | 4 ++-- pandas/core/groupby/grouper.py | 2 +- pandas/core/resample.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 98fef283eb8d9..82e948bb39bdb 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7952,7 +7952,7 @@ def resample( on=None, level=None, origin: Union[str, TimestampConvertibleTypes] = "start_day", - backward: Optional[bool] = None, + backward: Optional[bool_t] = None, offset: Optional[TimedeltaConvertibleTypes] = None, ) -> Resampler: """ @@ -8008,7 +8008,7 @@ def resample( level : str or int, optional For a MultiIndex, level (name or number) to use for resampling. `level` must be datetime-like. - origin : {'epoch', 'start', 'start_day', 'end', 'end_day'}, Timestamp + origin : {'epoch', 'start', 'start_day', 'end', 'end_day'}, Timestamp \ or str, default 'start_day' The timestamp on which to adjust the grouping. The timezone of origin must match the timezone of the index. 
diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index b040fe993100b..5dbd8d92e7cbb 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -83,7 +83,7 @@ class Grouper: However, loffset is also deprecated for ``.resample(...)`` See: :class:`DataFrame.resample` - origin : {'epoch', 'start', 'start_day', 'end', 'end_day'}, Timestamp + origin : {'epoch', 'start', 'start_day', 'end', 'end_day'}, Timestamp \ or str, default 'start_day' The timestamp on which to adjust the grouping. The timezone of origin must match the timezone of the index. diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 53e565a966769..8b391d087ce6f 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1366,7 +1366,7 @@ def __init__( convention: Optional[str] = None, base: Optional[int] = None, origin: Union[str, TimestampConvertibleTypes] = "start_day", - backward: Optional[bool] = None, + backward: Optional[bool_t] = None, offset: Optional[TimedeltaConvertibleTypes] = None, **kwargs, ): From 9f4844a629622ca91669112361dedb0e63165285 Mon Sep 17 00:00:00 2001 From: GYHHAHA <1801214626@qq.com> Date: Fri, 27 Nov 2020 23:34:35 +0800 Subject: [PATCH 19/42] Update resample.py --- pandas/core/resample.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 8b391d087ce6f..53e565a966769 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1366,7 +1366,7 @@ def __init__( convention: Optional[str] = None, base: Optional[int] = None, origin: Union[str, TimestampConvertibleTypes] = "start_day", - backward: Optional[bool_t] = None, + backward: Optional[bool] = None, offset: Optional[TimedeltaConvertibleTypes] = None, **kwargs, ): From 5b7f396d53ce8c73300d705da54170a4f321825a Mon Sep 17 00:00:00 2001 From: GYHHAHA <1801214626@qq.com> Date: Sat, 28 Nov 2020 00:04:13 +0800 Subject: [PATCH 20/42] fix docstring --- pandas/core/generic.py | 2 +- pandas/core/groupby/grouper.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 82e948bb39bdb..a063648c3c215 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8323,7 +8323,7 @@ def resample( 2000-10-01 23:52:00 18 2000-10-02 00:09:00 27 2000-10-02 00:26:00 63 - Freq: 17T, dtype: int32 + Freq: 17T, dtype: int64 You can also specify the backward origin: diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 5dbd8d92e7cbb..261190747ee61 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -219,11 +219,11 @@ class Grouper: If you want to take the last timestamp as `origin` with a backward resample: >>> ts.groupby(pd.Grouper(freq='17min', origin='end')).sum() - 2000-10-01 23:39:00 0 - 2000-10-01 23:56:00 0 - 2000-10-02 00:13:00 3 - 2000-10-02 00:30:00 6 - Freq: 17T, dtype: int32 + 2000-10-01 23:35:00 0 + 2000-10-01 23:52:00 18 + 2000-10-02 00:09:00 27 + 2000-10-02 00:26:00 63 + Freq: 17T, dtype: int64 If you want to adjust the start of the bins with an `offset` Timedelta, the two following lines are equivalent: From 115c92a3627b2ba002623eb6b10224725bfa94fb Mon Sep 17 00:00:00 2001 From: GYHHAHA <1801214626@qq.com> Date: Sat, 28 Nov 2020 09:14:25 +0800 Subject: [PATCH 21/42] split tests --- pandas/core/generic.py | 4 +- pandas/tests/resample/test_resample_api.py | 52 +++++++++++++--------- 2 files changed, 32 insertions(+), 24 deletions(-) diff --git a/pandas/core/generic.py 
b/pandas/core/generic.py index a063648c3c215..4b1865b2a34d8 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8328,8 +8328,8 @@ def resample( You can also specify the backward origin: >>> ts.groupby(pd.Grouper(freq='17min', - origin='2000-10-02 00:30:00', - backward=True)).sum() + ... origin='2000-10-02 00:30:00', + ... backward=True)).sum() 2000-10-01 23:39:00 3 2000-10-01 23:56:00 15 2000-10-02 00:13:00 45 diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 6b000cae7c1dc..54f2beab25d8b 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -613,14 +613,14 @@ def test_resample_agg_readonly(): tm.assert_series_equal(result, expected) -def test_backward_resample(): - # GH#37804 +# test data for backward resample GH#37804 +start, end = "2000-10-01 23:30:00", "2000-10-02 00:26:00" +rng = date_range(start, end, freq="7min") +ts = Series(np.arange(len(rng)) * 3, index=rng) - start, end = "2000-10-01 23:30:00", "2000-10-02 00:26:00" - rng = date_range(start, end, freq="7min") - ts = Series(np.arange(len(rng)) * 3, index=rng) - # test consistency of backward and origin +def test_backward_origin_consistency(): + msg = "`start` or `start_day` origin isn't allowed when `backward` is True" with pytest.raises(ValueError, match=msg): ts.resample("1min", origin="start", backward=True) @@ -628,7 +628,9 @@ def test_backward_resample(): with pytest.raises(ValueError, match=msg): ts.resample("1min", origin="end", backward=False) - # test end origin + +def test_end_origin(): + res = ts.resample("17min", origin="end").sum().astype("int64") data = [0, 18, 27, 63] expected = Series( @@ -642,7 +644,21 @@ def test_backward_resample(): tm.assert_series_equal(res, expected) - # test end_day origin + # an extra test case + idx = date_range("20200101 8:26:35", "20200101 9:31:58", freq="77s") + data = np.ones(len(idx)) + s = Series(data, index=idx) + result = s.resample("7min", origin="end", closed="right").sum() + + exp_idx = date_range("2020-01-01 08:27:45", "2020-01-01 09:30:45", freq="7T") + exp_data = [1.0, 6.0, 5.0, 6.0, 5.0, 6.0, 5.0, 6.0, 5.0, 6.0] + expected = Series(exp_data, index=exp_idx) + + tm.assert_series_equal(result, expected) + + +def test_end_day_origin(): + # 12 == 24 * 60 - 84 * 17 <= 26 (last value) <= 24 * 60 - 83 * 17 == 29 res = ts.resample("17min", origin="end_day").sum().astype("int64") data = [3, 15, 45, 45] @@ -657,7 +673,9 @@ def test_backward_resample(): tm.assert_series_equal(res, expected) - # test datetime origin with backward resample + +def test_backward_resample_with_datetime_origin(): + res = ( ts.resample( "17min", @@ -700,7 +718,9 @@ def test_backward_resample(): tm.assert_series_equal(res, expected) - # test right and left close + +def test_left_and_right_close_in_backward_resample(): + res = ( ts.resample( "17min", @@ -742,15 +762,3 @@ def test_backward_resample(): ) tm.assert_series_equal(res, expected) - - # original test case - idx = date_range("20200101 8:26:35", "20200101 9:31:58", freq="77s") - data = np.ones(len(idx)) - s = Series(data, index=idx) - result = s.resample("7min", origin="end", closed="right").sum() - - exp_idx = date_range("2020-01-01 08:27:45", "2020-01-01 09:30:45", freq="7T") - exp_data = [1.0, 6.0, 5.0, 6.0, 5.0, 6.0, 5.0, 6.0, 5.0, 6.0] - expected = Series(exp_data, index=exp_idx) - - tm.assert_series_equal(result, expected) From 7d8d67a07dfcdc9fa39f5fa11902efc0738bd813 Mon Sep 17 00:00:00 2001 From: GYHHAHA 
<1801214626@qq.com> Date: Sat, 28 Nov 2020 09:35:44 +0800 Subject: [PATCH 22/42] Update generic.py --- pandas/core/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 4b1865b2a34d8..222cf0af5869b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8334,7 +8334,7 @@ def resample( 2000-10-01 23:56:00 15 2000-10-02 00:13:00 45 2000-10-02 00:30:00 45 - Freq: 17T, dtype: int32 + Freq: 17T, dtype: int64 To replace the use of the deprecated `base` argument, you can now use `offset`, in this example it is equivalent to have `base=2`: From 77fc4a3fc0acaf1783f7ed87b53da6688b0f6395 Mon Sep 17 00:00:00 2001 From: GYHHAHA <1801214626@qq.com> Date: Sat, 28 Nov 2020 15:38:36 +0800 Subject: [PATCH 23/42] doc added & tests fix --- doc/source/user_guide/timeseries.rst | 44 ++++++++++++++++++++++ doc/source/whatsnew/v1.2.0.rst | 44 ++++++++++++++++++++++ pandas/tests/resample/test_resample_api.py | 26 ++++++++++--- 3 files changed, 108 insertions(+), 6 deletions(-) diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index ac8ba2fd929a6..8044172bc4c4a 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -1897,6 +1897,50 @@ Those two examples are equivalent for this time series: Note the use of ``'start'`` for ``origin`` on the last example. In that case, ``origin`` will be set to the first value of the timeseries. +Backward resample +~~~~~~~~~~~~~~~~~ + +.. versionadded:: 1.2.0 + +``origin`` can not only make a foreward resample, namely grouping from the starting point with the given ``freq`` , but is also able to implement the backward resample. This method allows users to control bins of the grouping from the given origin with a backward direction. (:issue:`37804`) + +.. ipython:: python + + start, end = "2000-10-01 23:30:00", "2000-10-02 00:26:00" + rng = date_range(start, end, freq="7min") + ts = Series(np.arange(len(rng)) * 3, index=rng) + +Setting ``offset='end'`` means using the max ``Timestamp`` as the ``origin`` with ``backward=True`` . + + ts.index.max() + ts.resample("17min", origin="end").sum() + +Setting ``offset='end'`` means using the ceiling midnight of the max ``Timestamp`` as the ``origin`` with ``backward=True`` . + +.. ipython:: python + + ts.resample("17min", origin="end").sum() + +If you want to make the backward resample from a Timestamp-like ``origin`` , ``backward=True`` should be set. + +.. ipython:: python + + ts.resample("17min", origin="2000-10-02 00:40:00", backward=True).sum() + +You can implement ``offset='end_day'`` in the following method equivalently. + +.. ipython:: python + + end_day_origin = ts.index.max().ceil("D") + end_day_origin + ts.resample("17min", origin=end_day_origin, backward=True).sum() + +By defualt, backward resample uses ``closed=right`` while ``closed=left`` is also available. + +.. ipython:: python + + ts.resample("17min", closed="left", origin="end").sum() + .. _timeseries.periods: Time span representation diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index f751a91cecf19..d1899e1d72509 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -203,6 +203,50 @@ example where the index name is preserved: The same is true for :class:`MultiIndex`, but the logic is applied separately on a level-by-level basis. +.. 
_whatsnew_120.backward_resample: + +Backward resample +^^^^^^^^^^^^^^^^^ + +:class:`Grouper` and :meth:`DataFrame.resample` now support the argument ``backward`` . ``'end'`` and ``'end_day'`` are available in argument ``offset`` . Backward resample allows users to control bins of the grouping from the given origin with a backward direction. (:issue:`37804`) + +.. ipython:: python + + start, end = "2000-10-01 23:30:00", "2000-10-02 00:26:00" + rng = date_range(start, end, freq="7min") + ts = Series(np.arange(len(rng)) * 3, index=rng) + +Setting ``offset='end'`` means using the max ``Timestamp`` as the ``origin`` with ``backward=True`` . + + ts.index.max() + ts.resample("17min", origin="end").sum() + +Setting ``offset='end'`` means using the ceiling midnight of the max ``Timestamp`` as the ``origin`` with ``backward=True`` . + +.. ipython:: python + + ts.resample("17min", origin="end").sum() + +If you want to make the backward resample from a Timestamp-like ``origin`` , ``backward=True`` should be set. + +.. ipython:: python + + ts.resample("17min", origin="2000-10-02 00:40:00", backward=True).sum() + +You can implement ``offset='end_day'`` in the following method equivalently. + +.. ipython:: python + + end_day_origin = ts.index.max().ceil("D") + end_day_origin + ts.resample("17min", origin=end_day_origin, backward=True).sum() + +By defualt, backward resample uses ``closed=right`` while ``closed=left`` is also available. + +.. ipython:: python + + ts.resample("17min", closed="left", origin="end").sum() + .. _whatsnew_120.enhancements.other: Other enhancements diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 54f2beab25d8b..3b13e16b3df82 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -613,14 +613,12 @@ def test_resample_agg_readonly(): tm.assert_series_equal(result, expected) -# test data for backward resample GH#37804 -start, end = "2000-10-01 23:30:00", "2000-10-02 00:26:00" -rng = date_range(start, end, freq="7min") -ts = Series(np.arange(len(rng)) * 3, index=rng) - - def test_backward_origin_consistency(): + start, end = "2000-10-01 23:30:00", "2000-10-02 00:26:00" + rng = date_range(start, end, freq="7min") + ts = Series(np.arange(len(rng)) * 3, index=rng) + msg = "`start` or `start_day` origin isn't allowed when `backward` is True" with pytest.raises(ValueError, match=msg): ts.resample("1min", origin="start", backward=True) @@ -631,6 +629,10 @@ def test_backward_origin_consistency(): def test_end_origin(): + start, end = "2000-10-01 23:30:00", "2000-10-02 00:26:00" + rng = date_range(start, end, freq="7min") + ts = Series(np.arange(len(rng)) * 3, index=rng) + res = ts.resample("17min", origin="end").sum().astype("int64") data = [0, 18, 27, 63] expected = Series( @@ -659,6 +661,10 @@ def test_end_origin(): def test_end_day_origin(): + start, end = "2000-10-01 23:30:00", "2000-10-02 00:26:00" + rng = date_range(start, end, freq="7min") + ts = Series(np.arange(len(rng)) * 3, index=rng) + # 12 == 24 * 60 - 84 * 17 <= 26 (last value) <= 24 * 60 - 83 * 17 == 29 res = ts.resample("17min", origin="end_day").sum().astype("int64") data = [3, 15, 45, 45] @@ -676,6 +682,10 @@ def test_end_day_origin(): def test_backward_resample_with_datetime_origin(): + start, end = "2000-10-01 23:30:00", "2000-10-02 00:26:00" + rng = date_range(start, end, freq="7min") + ts = Series(np.arange(len(rng)) * 3, index=rng) + res = ( ts.resample( "17min", @@ -721,6 +731,10 @@ def 
test_backward_resample_with_datetime_origin():
 def test_left_and_right_close_in_backward_resample():
+    start, end = "2000-10-01 23:30:00", "2000-10-02 00:26:00"
+    rng = date_range(start, end, freq="7min")
+    ts = Series(np.arange(len(rng)) * 3, index=rng)
+
     res = (
         ts.resample(
             "17min",

From b49229367fc3ab02e81c8c373d05c021560054f2 Mon Sep 17 00:00:00 2001
From: GYHHAHA <1801214626@qq.com>
Date: Sat, 28 Nov 2020 18:31:07 +0800
Subject: [PATCH 24/42] fix doc

---
 doc/source/user_guide/timeseries.rst | 28 ++++++++++++++------------
 doc/source/whatsnew/v1.2.0.rst | 30 +++++++++------------------
 2 files changed, 24 insertions(+), 34 deletions(-)

diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst
index bee72ec70d95e..843da644848b1 100644
--- a/doc/source/user_guide/timeseries.rst
+++ b/doc/source/user_guide/timeseries.rst
@@ -1888,31 +1888,39 @@ Those two examples are equivalent for this time series:
 Note the use of ``'start'`` for ``origin`` on the last example. In that case, ``origin`` will be set to the first value of the timeseries.

+.. _timeseries.backward-resample:
+
 Backward resample
 ~~~~~~~~~~~~~~~~~

 .. versionadded:: 1.2.0

-``origin`` can not only make a foreward resample, namely grouping from the starting point with the given ``freq`` , but is also able to implement the backward resample. This method allows users to control bins of the grouping from the given origin with a backward direction. (:issue:`37804`)
+``origin`` can not only drive a forward resample, namely grouping from the starting point with the given ``freq``, but can also implement a backward resample. This lets users control the bins of the grouping from the given origin in a backward direction. (:issue:`37804`)

 .. ipython:: python

    start, end = "2000-10-01 23:30:00", "2000-10-02 00:26:00"
-   rng = date_range(start, end, freq="7min")
-   ts = Series(np.arange(len(rng)) * 3, index=rng)
+   rng = pd.date_range(start, end, freq="7min")
+   ts = pd.Series(np.arange(len(rng)) * 3, index=rng)

-Setting ``offset='end'`` means using the max ``Timestamp`` as the ``origin`` with ``backward=True`` .
+Setting ``origin='end'`` means using the max ``Timestamp`` as the ``origin`` with ``backward=True``.

    ts.index.max()
    ts.resample("17min", origin="end").sum()

-Setting ``offset='end'`` means using the ceiling midnight of the max ``Timestamp`` as the ``origin`` with ``backward=True`` .
+A forward resample labels each bin by its left edge and groups from the current label to the next one, with ``closed=left`` by default. In contrast, a backward resample labels each bin by its right edge and groups from the previous label to the current one, with ``closed=right`` by default. If you want to change this, ``closed=left`` is also available.

 .. ipython:: python

-   ts.resample("17min", origin="end").sum()
+   ts.resample("17min", closed="left", origin="end").sum()
+
+Setting ``origin='end_day'`` means using the ceiling midnight of the max ``Timestamp`` as the ``origin`` with ``backward=True``.
+
+.. ipython:: python

-If you want to make the backward resample from a Timestamp-like ``origin`` , ``backward=True`` should be set.
+   ts.resample("17min", origin="end_day").sum()
+
+If you want to run a backward resample from a Timestamp-like ``origin``, ``backward=True`` must be set.

 .. ipython:: python

@@ -1926,12 +1934,6 @@ You can implement ``offset='end_day'`` in the following method equivalently.
    end_day_origin
    ts.resample("17min", origin=end_day_origin, backward=True).sum()

-By defualt, backward resample uses ``closed=right`` while ``closed=left`` is also available.
-
-.. ipython:: python
-
-   ts.resample("17min", closed="left", origin="end").sum()
-
 .. _timeseries.periods:

 Time span representation

diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index ac8132339d38c..d45813960d5c2 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -206,47 +206,35 @@ level-by-level basis.

 .. _whatsnew_120.backward_resample:

-Backward resample 
+Backward resample
 ^^^^^^^^^^^^^^^^^

-:class:`Grouper` and :meth:`DataFrame.resample` now support the argument ``backward`` . ``'end'`` and ``'end_day'`` are available in argument ``offset`` . Backward resample allows users to control bins of the grouping from the given origin with a backward direction. (:issue:`37804`)
+:class:`Grouper` and :meth:`DataFrame.resample` now support the argument ``backward``. ``'end'`` and ``'end_day'`` are available for the argument ``origin``. Backward resampling allows users to control the bins of the grouping from the given origin in a backward direction. (:issue:`37804`)

 .. ipython:: python

    start, end = "2000-10-01 23:30:00", "2000-10-02 00:26:00"
-   rng = date_range(start, end, freq="7min")
-   ts = Series(np.arange(len(rng)) * 3, index=rng)
+   rng = pd.date_range(start, end, freq="7min")
+   ts = pd.Series(np.arange(len(rng)) * 3, index=rng)

-Setting ``offset='end'`` means using the max ``Timestamp`` as the ``origin`` with ``backward=True`` .
+Setting ``origin='end'`` means using the max ``Timestamp`` as the ``origin`` with ``backward=True``.

    ts.index.max()
    ts.resample("17min", origin="end").sum()

-Setting ``offset='end'`` means using the ceiling midnight of the max ``Timestamp`` as the ``origin`` with ``backward=True`` .
+Setting ``origin='end_day'`` means using the ceiling midnight of the max ``Timestamp`` as the ``origin`` with ``backward=True``.

 .. ipython:: python

-   ts.resample("17min", origin="end").sum()
+   ts.resample("17min", origin="end_day").sum()

-If you want to make the backward resample from a Timestamp-like ``origin`` , ``backward=True`` should be set.
+If you want to run a backward resample from a Timestamp-like ``origin``, ``backward=True`` must be set.

 .. ipython:: python

    ts.resample("17min", origin="2000-10-02 00:40:00", backward=True).sum()

-You can implement ``offset='end_day'`` in the following method equivalently.
-
-.. ipython:: python
-
-   end_day_origin = ts.index.max().ceil("D")
-   end_day_origin
-   ts.resample("17min", origin=end_day_origin, backward=True).sum()
-
-By defualt, backward resample uses ``closed=right`` while ``closed=left`` is also available.
-
-.. ipython:: python
-
-   ts.resample("17min", closed="left", origin="end").sum()
+For details, see: :ref:`timeseries.backward-resample`.

 .. _whatsnew_120.groupby_ewm:

From 76a015acf5e00542f48202481898f31ada5789e1 Mon Sep 17 00:00:00 2001
From: GYHHAHA <1801214626@qq.com>
Date: Fri, 11 Dec 2020 12:52:55 +0800
Subject: [PATCH 25/42] Revert "Merge remote-tracking branch 'upstream/master'"

This reverts commit 561096c991eff2c3cca2f2be7ecc5dd3b2f6a97d, reversing changes made to b49229367fc3ab02e81c8c373d05c021560054f2.
--- Dockerfile | 15 +- README.md | 38 +- asv_bench/benchmarks/indexing.py | 8 - asv_bench/benchmarks/rolling.py | 14 - ci/deps/azure-38-numpydev.yaml | 2 +- doc/source/development/contributing.rst | 5 +- doc/source/reference/series.rst | 1 + doc/source/user_guide/io.rst | 40 +- doc/source/whatsnew/v1.1.5.rst | 5 - doc/source/whatsnew/v1.2.0.rst | 100 +---- pandas/_libs/groupby.pyx | 12 +- pandas/_libs/reduction.pyx | 4 +- pandas/_libs/tslibs/period.pyx | 11 +- pandas/_testing.py | 43 +- pandas/_typing.py | 4 +- pandas/compat/numpy/function.py | 16 +- pandas/conftest.py | 39 +- pandas/core/algorithms.py | 75 ++-- pandas/core/apply.py | 27 +- pandas/core/arrays/_mixins.py | 2 +- pandas/core/arrays/base.py | 22 +- pandas/core/arrays/boolean.py | 7 +- pandas/core/arrays/categorical.py | 12 +- pandas/core/arrays/datetimelike.py | 21 +- pandas/core/arrays/floating.py | 76 +++- pandas/core/arrays/integer.py | 81 +++- pandas/core/arrays/interval.py | 92 ++-- pandas/core/arrays/numeric.py | 92 ---- pandas/core/arrays/numpy_.py | 18 +- pandas/core/arrays/sparse/array.py | 2 +- pandas/core/arrays/string_.py | 16 +- pandas/core/arrays/timedeltas.py | 4 +- pandas/core/base.py | 14 +- pandas/core/computation/pytables.py | 2 +- pandas/core/construction.py | 18 - pandas/core/dtypes/cast.py | 208 +++++---- pandas/core/dtypes/concat.py | 28 +- pandas/core/dtypes/dtypes.py | 8 +- pandas/core/dtypes/generic.py | 75 +--- pandas/core/frame.py | 39 +- pandas/core/generic.py | 118 +++-- pandas/core/groupby/base.py | 4 - pandas/core/groupby/generic.py | 188 ++++---- pandas/core/groupby/groupby.py | 216 ++++----- pandas/core/groupby/grouper.py | 12 +- pandas/core/groupby/ops.py | 162 +++---- pandas/core/indexers.py | 3 - pandas/core/indexes/base.py | 170 ++----- pandas/core/indexes/category.py | 25 +- pandas/core/indexes/datetimelike.py | 20 +- pandas/core/indexes/datetimes.py | 3 +- pandas/core/indexes/extension.py | 2 +- pandas/core/indexes/interval.py | 73 +-- pandas/core/indexes/multi.py | 67 ++- pandas/core/indexes/numeric.py | 33 +- pandas/core/indexes/period.py | 33 +- pandas/core/indexes/range.py | 33 +- pandas/core/indexing.py | 28 +- pandas/core/internals/blocks.py | 33 +- pandas/core/internals/managers.py | 21 +- pandas/core/missing.py | 140 ++---- pandas/core/nanops.py | 19 +- pandas/core/ops/__init__.py | 5 +- pandas/core/ops/array_ops.py | 34 +- pandas/core/ops/methods.py | 58 +-- pandas/core/resample.py | 2 +- pandas/core/reshape/melt.py | 4 +- pandas/core/reshape/merge.py | 28 +- pandas/core/reshape/pivot.py | 123 ++---- pandas/core/series.py | 56 +-- pandas/core/shared_docs.py | 2 +- pandas/core/sorting.py | 21 +- pandas/core/strings/accessor.py | 112 +++-- pandas/core/util/numba_.py | 2 +- pandas/core/window/rolling.py | 38 +- pandas/io/common.py | 16 +- pandas/io/excel/_base.py | 98 +---- pandas/io/excel/_xlwt.py | 4 +- pandas/io/formats/csvs.py | 14 +- pandas/io/formats/latex.py | 4 +- pandas/io/formats/printing.py | 2 +- pandas/io/formats/style.py | 20 +- pandas/io/html.py | 5 +- pandas/io/json/_json.py | 15 +- pandas/io/parquet.py | 61 +-- pandas/io/parsers.py | 28 +- pandas/io/pytables.py | 16 +- pandas/io/sas/sasreader.py | 17 +- pandas/io/stata.py | 2 +- pandas/plotting/_matplotlib/timeseries.py | 5 +- pandas/tests/arithmetic/conftest.py | 12 + pandas/tests/arithmetic/test_timedelta64.py | 2 +- .../tests/arrays/boolean/test_arithmetic.py | 13 +- pandas/tests/arrays/boolean/test_function.py | 7 - .../tests/arrays/categorical/test_missing.py | 8 +- 
pandas/tests/arrays/floating/test_function.py | 7 - .../tests/arrays/integer/test_arithmetic.py | 52 +-- pandas/tests/arrays/integer/test_function.py | 8 - pandas/tests/arrays/masked/test_arithmetic.py | 6 +- .../tests/arrays/sparse/test_arithmetics.py | 45 +- pandas/tests/arrays/sparse/test_libsparse.py | 77 ++-- pandas/tests/arrays/string_/test_string.py | 21 - pandas/tests/arrays/test_datetimelike.py | 25 +- pandas/tests/base/test_conversion.py | 21 +- .../tests/dtypes/cast/test_convert_objects.py | 12 + pandas/tests/dtypes/test_common.py | 20 +- pandas/tests/dtypes/test_dtypes.py | 8 - pandas/tests/dtypes/test_inference.py | 4 +- pandas/tests/extension/arrow/test_bool.py | 4 - pandas/tests/extension/base/interface.py | 23 - pandas/tests/extension/base/methods.py | 8 +- pandas/tests/extension/decimal/array.py | 8 - pandas/tests/extension/json/test_json.py | 7 - pandas/tests/extension/test_boolean.py | 6 +- pandas/tests/extension/test_categorical.py | 22 - pandas/tests/extension/test_floating.py | 4 - pandas/tests/extension/test_integer.py | 9 +- pandas/tests/extension/test_string.py | 4 - pandas/tests/frame/apply/test_frame_apply.py | 6 - pandas/tests/frame/common.py | 4 +- pandas/tests/frame/indexing/test_indexing.py | 22 +- pandas/tests/frame/indexing/test_setitem.py | 18 - pandas/tests/frame/indexing/test_where.py | 12 +- pandas/tests/frame/methods/test_astype.py | 2 +- .../tests/frame/methods/test_combine_first.py | 103 ++--- pandas/tests/frame/methods/test_convert.py | 4 +- pandas/tests/frame/methods/test_diff.py | 8 +- .../frame/methods/test_drop_duplicates.py | 9 - pandas/tests/frame/methods/test_fillna.py | 6 +- pandas/tests/frame/methods/test_rename.py | 4 +- pandas/tests/frame/methods/test_replace.py | 2 +- .../tests/frame/methods/test_reset_index.py | 2 +- .../tests/frame/methods/test_sort_values.py | 8 +- pandas/tests/frame/methods/test_to_csv.py | 12 +- pandas/tests/frame/methods/test_to_records.py | 57 ++- pandas/tests/frame/test_arithmetic.py | 24 +- pandas/tests/frame/test_constructors.py | 14 +- pandas/tests/frame/test_nonunique_indexes.py | 2 +- pandas/tests/frame/test_query_eval.py | 2 +- pandas/tests/frame/test_reductions.py | 32 +- pandas/tests/frame/test_stack_unstack.py | 55 +-- pandas/tests/frame/test_ufunc.py | 2 +- pandas/tests/frame/test_validate.py | 2 +- pandas/tests/generic/test_duplicate_labels.py | 6 +- pandas/tests/groupby/test_apply.py | 26 +- pandas/tests/groupby/test_function.py | 411 ++++++++--------- pandas/tests/groupby/test_groupby.py | 12 +- pandas/tests/groupby/test_grouping.py | 5 +- pandas/tests/groupby/test_nth.py | 20 - pandas/tests/groupby/test_quantile.py | 27 +- pandas/tests/groupby/test_rank.py | 77 +--- pandas/tests/groupby/test_timegrouper.py | 2 +- pandas/tests/groupby/test_value_counts.py | 10 +- .../tests/groupby/transform/test_transform.py | 54 +-- .../tests/indexes/base_class/test_setops.py | 2 +- .../indexes/categorical/test_category.py | 8 +- pandas/tests/indexes/common.py | 5 +- .../tests/indexes/datetimes/test_datetime.py | 51 +-- pandas/tests/indexes/datetimes/test_setops.py | 17 +- pandas/tests/indexes/interval/test_formats.py | 29 +- pandas/tests/indexes/interval/test_setops.py | 6 +- pandas/tests/indexes/multi/test_analytics.py | 3 +- pandas/tests/indexes/multi/test_drop.py | 12 - pandas/tests/indexes/multi/test_setops.py | 43 +- pandas/tests/indexes/period/test_indexing.py | 53 --- .../indexes/period/test_partial_slicing.py | 2 +- pandas/tests/indexes/ranges/test_setops.py | 56 +-- pandas/tests/indexes/test_base.py | 21 
+- pandas/tests/indexes/test_numpy_compat.py | 6 +- pandas/tests/indexes/test_setops.py | 58 +-- .../indexes/timedeltas/test_timedelta.py | 11 +- .../indexing/multiindex/test_multiindex.py | 10 - .../tests/indexing/multiindex/test_slice.py | 10 - pandas/tests/indexing/test_categorical.py | 9 +- pandas/tests/indexing/test_indexers.py | 6 - pandas/tests/indexing/test_loc.py | 36 +- pandas/tests/indexing/test_scalar.py | 70 ++- pandas/tests/io/conftest.py | 3 - pandas/tests/io/excel/test_openpyxl.py | 13 +- pandas/tests/io/excel/test_readers.py | 6 +- pandas/tests/io/excel/test_style.py | 17 +- pandas/tests/io/excel/test_writers.py | 59 ++- pandas/tests/io/excel/test_xlrd.py | 46 +- pandas/tests/io/excel/test_xlsxwriter.py | 19 +- pandas/tests/io/formats/test_to_excel.py | 4 +- pandas/tests/io/formats/test_to_html.py | 4 +- pandas/tests/io/formats/test_to_latex.py | 2 +- pandas/tests/io/json/test_compression.py | 6 +- pandas/tests/io/json/test_normalize.py | 10 +- pandas/tests/io/json/test_pandas.py | 22 +- pandas/tests/io/json/test_readlines.py | 37 +- pandas/tests/io/json/test_ujson.py | 28 +- pandas/tests/io/parser/conftest.py | 2 +- pandas/tests/io/parser/test_c_parser_only.py | 12 +- pandas/tests/io/parser/test_common.py | 118 ++--- pandas/tests/io/parser/test_compression.py | 4 +- pandas/tests/io/parser/test_converters.py | 2 +- pandas/tests/io/parser/test_dtypes.py | 20 +- pandas/tests/io/parser/test_mangle_dupes.py | 2 +- pandas/tests/io/parser/test_multi_thread.py | 19 +- pandas/tests/io/parser/test_network.py | 46 +- pandas/tests/io/parser/test_parse_dates.py | 6 +- pandas/tests/io/parser/test_textreader.py | 6 +- pandas/tests/io/pytables/test_store.py | 120 ++--- pandas/tests/io/sas/test_sas7bdat.py | 39 +- pandas/tests/io/sas/test_xport.py | 41 +- pandas/tests/io/test_common.py | 12 +- pandas/tests/io/test_html.py | 12 +- pandas/tests/io/test_parquet.py | 55 +-- pandas/tests/io/test_stata.py | 10 +- pandas/tests/plotting/common.py | 7 - pandas/tests/plotting/frame/test_frame.py | 85 +++- .../tests/plotting/frame/test_frame_color.py | 29 +- .../plotting/frame/test_frame_groupby.py | 2 - .../plotting/frame/test_frame_subplots.py | 77 ++-- pandas/tests/plotting/test_backend.py | 3 - pandas/tests/plotting/test_boxplot_method.py | 30 +- pandas/tests/plotting/test_common.py | 2 - pandas/tests/plotting/test_converter.py | 3 - pandas/tests/plotting/test_datetimelike.py | 57 ++- pandas/tests/plotting/test_groupby.py | 2 - pandas/tests/plotting/test_hist_method.py | 20 +- pandas/tests/plotting/test_misc.py | 14 +- pandas/tests/plotting/test_series.py | 32 +- pandas/tests/plotting/test_style.py | 2 - pandas/tests/resample/test_datetime_index.py | 8 +- pandas/tests/resample/test_time_grouper.py | 28 +- .../tests/reshape/concat/test_categorical.py | 14 +- pandas/tests/reshape/concat/test_datetimes.py | 20 +- pandas/tests/reshape/concat/test_empty.py | 2 +- pandas/tests/reshape/concat/test_invalid.py | 6 +- pandas/tests/reshape/merge/test_merge.py | 52 +-- .../merge/test_merge_index_as_string.py | 20 +- .../tests/reshape/merge/test_merge_ordered.py | 81 ---- pandas/tests/reshape/merge/test_multi.py | 142 +++--- pandas/tests/reshape/test_crosstab.py | 33 +- pandas/tests/reshape/test_cut.py | 8 +- pandas/tests/reshape/test_qcut.py | 8 +- pandas/tests/scalar/period/test_period.py | 16 - pandas/tests/series/indexing/test_datetime.py | 2 +- pandas/tests/series/indexing/test_getitem.py | 20 +- pandas/tests/series/indexing/test_indexing.py | 11 +- pandas/tests/series/indexing/test_xs.py | 4 +- 
pandas/tests/series/methods/test_convert.py | 76 +++- .../series/methods/test_convert_dtypes.py | 416 +++++++++++------- .../tests/series/methods/test_interpolate.py | 80 +--- pandas/tests/series/test_arithmetic.py | 19 +- pandas/tests/test_algos.py | 3 + pandas/tests/tools/test_to_numeric.py | 42 +- pandas/tests/tseries/holiday/test_holiday.py | 16 +- .../tests/tseries/holiday/test_observance.py | 22 +- pandas/tests/tseries/offsets/test_offsets.py | 4 +- pandas/tests/tslibs/test_to_offset.py | 18 +- .../util/test_assert_categorical_equal.py | 2 +- .../util/test_assert_extension_array_equal.py | 8 +- pandas/tests/util/test_assert_frame_equal.py | 18 +- pandas/tests/util/test_assert_index_equal.py | 6 +- .../util/test_assert_interval_array_equal.py | 14 +- pandas/tests/util/test_assert_series_equal.py | 8 +- pandas/tests/util/test_show_versions.py | 2 +- pandas/tests/util/test_validate_args.py | 4 +- pandas/tests/util/test_validate_kwargs.py | 2 +- .../moments/test_moments_consistency_ewm.py | 1 + .../test_moments_consistency_rolling.py | 1 + pandas/tests/window/test_groupby.py | 79 +--- pandas/tests/window/test_rolling.py | 11 +- pandas/util/_print_versions.py | 2 +- 267 files changed, 3078 insertions(+), 4938 deletions(-) delete mode 100644 pandas/core/arrays/numeric.py create mode 100644 pandas/tests/dtypes/cast/test_convert_objects.py diff --git a/Dockerfile b/Dockerfile index de1c564921de9..5d7a2b9e6b743 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM quay.io/condaforge/miniforge3 +FROM continuumio/miniconda3 # if you forked pandas, you can pass in your own GitHub username to use your fork # i.e. gh_username=myname @@ -15,6 +15,10 @@ RUN apt-get update \ # Verify git, process tools, lsb-release (common in install instructions for CLIs) installed && apt-get -y install git iproute2 procps iproute2 lsb-release \ # + # Install C compilers (gcc not enough, so just went with build-essential which admittedly might be overkill), + # needed to build pandas C extensions + && apt-get -y install build-essential \ + # # cleanup && apt-get autoremove -y \ && apt-get clean -y \ @@ -35,14 +39,9 @@ RUN mkdir "$pandas_home" \ # we just update the base/root one from the 'environment.yml' file instead of creating a new one. # # Set up environment -RUN conda install -y mamba -RUN mamba env update -n base -f "$pandas_home/environment.yml" +RUN conda env update -n base -f "$pandas_home/environment.yml" # Build C extensions and pandas -SHELL ["/bin/bash", "-c"] -RUN . /opt/conda/etc/profile.d/conda.sh \ - && conda activate base \ - && cd "$pandas_home" \ - && export \ +RUN cd "$pandas_home" \ && python setup.py build_ext -j 4 \ && python -m pip install -e . 
diff --git a/README.md b/README.md index 6d1d890c54093..4072faffe3b3a 100644 --- a/README.md +++ b/README.md @@ -63,24 +63,24 @@ Here are just a few of the things that pandas does well: date shifting and lagging - [missing-data]: https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html - [insertion-deletion]: https://pandas.pydata.org/pandas-docs/stable/user_guide/dsintro.html#column-selection-addition-deletion - [alignment]: https://pandas.pydata.org/pandas-docs/stable/user_guide/dsintro.html?highlight=alignment#intro-to-data-structures - [groupby]: https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#group-by-split-apply-combine - [conversion]: https://pandas.pydata.org/pandas-docs/stable/user_guide/dsintro.html#dataframe - [slicing]: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#slicing-ranges - [fancy-indexing]: https://pandas.pydata.org/pandas-docs/stable/user_guide/advanced.html#advanced - [subsetting]: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#boolean-indexing - [merging]: https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html#database-style-dataframe-or-named-series-joining-merging - [joining]: https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html#joining-on-index - [reshape]: https://pandas.pydata.org/pandas-docs/stable/user_guide/reshaping.html - [pivot-table]: https://pandas.pydata.org/pandas-docs/stable/user_guide/reshaping.html - [mi]: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#hierarchical-indexing-multiindex - [flat-files]: https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#csv-text-files - [excel]: https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#excel-files - [db]: https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#sql-queries - [hdfstore]: https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#hdf5-pytables - [timeseries]: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#time-series-date-functionality + [missing-data]: https://pandas.pydata.org/pandas-docs/stable/missing_data.html#working-with-missing-data + [insertion-deletion]: https://pandas.pydata.org/pandas-docs/stable/dsintro.html#column-selection-addition-deletion + [alignment]: https://pandas.pydata.org/pandas-docs/stable/dsintro.html?highlight=alignment#intro-to-data-structures + [groupby]: https://pandas.pydata.org/pandas-docs/stable/groupby.html#group-by-split-apply-combine + [conversion]: https://pandas.pydata.org/pandas-docs/stable/dsintro.html#dataframe + [slicing]: https://pandas.pydata.org/pandas-docs/stable/indexing.html#slicing-ranges + [fancy-indexing]: https://pandas.pydata.org/pandas-docs/stable/indexing.html#advanced-indexing-with-ix + [subsetting]: https://pandas.pydata.org/pandas-docs/stable/indexing.html#boolean-indexing + [merging]: https://pandas.pydata.org/pandas-docs/stable/merging.html#database-style-dataframe-joining-merging + [joining]: https://pandas.pydata.org/pandas-docs/stable/merging.html#joining-on-index + [reshape]: https://pandas.pydata.org/pandas-docs/stable/reshaping.html#reshaping-and-pivot-tables + [pivot-table]: https://pandas.pydata.org/pandas-docs/stable/reshaping.html#pivot-tables-and-cross-tabulations + [mi]: https://pandas.pydata.org/pandas-docs/stable/indexing.html#hierarchical-indexing-multiindex + [flat-files]: https://pandas.pydata.org/pandas-docs/stable/io.html#csv-text-files + [excel]: https://pandas.pydata.org/pandas-docs/stable/io.html#excel-files + [db]: 
https://pandas.pydata.org/pandas-docs/stable/io.html#sql-queries + [hdfstore]: https://pandas.pydata.org/pandas-docs/stable/io.html#hdf5-pytables + [timeseries]: https://pandas.pydata.org/pandas-docs/stable/timeseries.html#time-series-date-functionality ## Where to get it The source code is currently hosted on GitHub at: @@ -154,7 +154,7 @@ For usage questions, the best place to go to is [StackOverflow](https://stackove Further, general questions and discussions can also take place on the [pydata mailing list](https://groups.google.com/forum/?fromgroups#!forum/pydata). ## Discussion and Development -Most development discussions take place on GitHub in this repo. Further, the [pandas-dev mailing list](https://mail.python.org/mailman/listinfo/pandas-dev) can also be used for specialized discussions or design issues, and a [Gitter channel](https://gitter.im/pydata/pandas) is available for quick development related questions. +Most development discussions take place on github in this repo. Further, the [pandas-dev mailing list](https://mail.python.org/mailman/listinfo/pandas-dev) can also be used for specialized discussions or design issues, and a [Gitter channel](https://gitter.im/pydata/pandas) is available for quick development related questions. ## Contributing to pandas [![Open Source Helpers](https://www.codetriage.com/pandas-dev/pandas/badges/users.svg)](https://www.codetriage.com/pandas-dev/pandas) diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 4fd91c8aafe4b..74e0a3a434cde 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -358,14 +358,6 @@ def time_assign_with_setitem(self): for i in range(100): self.df[i] = np.random.randn(self.N) - def time_assign_list_like_with_setitem(self): - np.random.seed(1234) - self.df[list(range(100))] = np.random.randn(self.N, 100) - - def time_assign_list_of_columns_concat(self): - df = DataFrame(np.random.randn(self.N, 100)) - concat([self.df, df], axis=1) - class ChainIndexing: diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index 5a36cff7908f0..79a33c437ea5c 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -225,20 +225,6 @@ def time_rolling_offset(self, method): getattr(self.groupby_roll_offset, method)() -class GroupbyLargeGroups: - # https://github.com/pandas-dev/pandas/issues/38038 - # specific example where the rolling operation on a larger dataframe - # is relatively cheap (few but large groups), but creation of - # MultiIndex of result can be expensive - - def setup(self): - N = 100000 - self.df = pd.DataFrame({"A": [1, 2] * int(N / 2), "B": np.random.randn(N)}) - - def time_rolling_multiindex_creation(self): - self.df.groupby("A").rolling(3).mean() - - class GroupbyEWM: params = ["cython", "numba"] diff --git a/ci/deps/azure-38-numpydev.yaml b/ci/deps/azure-38-numpydev.yaml index f11a3bcb28ab2..274be0361c2e5 100644 --- a/ci/deps/azure-38-numpydev.yaml +++ b/ci/deps/azure-38-numpydev.yaml @@ -12,7 +12,7 @@ dependencies: # pandas dependencies - pytz - - pip=20.2 + - pip - pip: - cython==0.29.21 # GH#34014 - "git+git://github.com/dateutil/dateutil.git" diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index 86d495ef2b097..3c5a88333be56 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -147,9 +147,8 @@ Creating a development environment To test out code changes, you'll need to build pandas 
from source, which requires a C/C++ compiler and Python environment. If you're making documentation -changes, you can skip to :ref:`contributing.documentation` but if you skip -creating the development environment you won't be able to build the documentation -locally before pushing your changes. +changes, you can skip to :ref:`contributing.documentation` but you won't be able +to build the documentation locally before pushing your changes. Using a Docker container ~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index cc2937695e80f..8d74c288bf801 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -252,6 +252,7 @@ Combining / comparing / joining / merging Series.append Series.compare + Series.replace Series.update Time Series-related diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 2b324a74fffaf..1bd35131622ab 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1577,21 +1577,19 @@ value will be an iterable object of type ``TextFileReader``: .. ipython:: python - with pd.read_csv("tmp.sv", sep="|", chunksize=4) as reader: - reader - for chunk in reader: - print(chunk) + reader = pd.read_csv("tmp.sv", sep="|", chunksize=4) + reader -.. versionchanged:: 1.2 + for chunk in reader: + print(chunk) - ``read_csv/json/sas`` return a context-manager when iterating through a file. Specifying ``iterator=True`` will also return the ``TextFileReader`` object: .. ipython:: python - with pd.read_csv("tmp.sv", sep="|", iterator=True) as reader: - reader.get_chunk(5) + reader = pd.read_csv("tmp.sv", sep="|", iterator=True) + reader.get_chunk(5) .. ipython:: python :suppress: @@ -2240,10 +2238,10 @@ For line-delimited json files, pandas can also return an iterator which reads in df.to_json(orient="records", lines=True) # reader is an iterator that returns ``chunksize`` lines each iteration - with pd.read_json(StringIO(jsonl), lines=True, chunksize=1) as reader: - reader - for chunk in reader: - print(chunk) + reader = pd.read_json(StringIO(jsonl), lines=True, chunksize=1) + reader + for chunk in reader: + print(chunk) .. _io.table_schema: @@ -5473,9 +5471,9 @@ object can be used as an iterator. .. ipython:: python - with pd.read_stata("stata.dta", chunksize=3) as reader: - for df in reader: - print(df.shape) + reader = pd.read_stata("stata.dta", chunksize=3) + for df in reader: + print(df.shape) For more fine-grained control, use ``iterator=True`` and specify ``chunksize`` with each call to @@ -5483,9 +5481,9 @@ For more fine-grained control, use ``iterator=True`` and specify .. ipython:: python - with pd.read_stata("stata.dta", iterator=True) as reader: - chunk1 = reader.read(5) - chunk2 = reader.read(5) + reader = pd.read_stata("stata.dta", iterator=True) + chunk1 = reader.read(5) + chunk2 = reader.read(5) Currently the ``index`` is retrieved as a column. @@ -5597,9 +5595,9 @@ Obtain an iterator and read an XPORT file 100,000 lines at a time: pass - with pd.read_sas("sas_xport.xpt", chunk=100000) as rdr: - for chunk in rdr: - do_something(chunk) + rdr = pd.read_sas("sas_xport.xpt", chunk=100000) + for chunk in rdr: + do_something(chunk) The specification_ for the xport file format is available from the SAS web site. 
diff --git a/doc/source/whatsnew/v1.1.5.rst b/doc/source/whatsnew/v1.1.5.rst index 7164830392f35..46c4ad4f35fe4 100644 --- a/doc/source/whatsnew/v1.1.5.rst +++ b/doc/source/whatsnew/v1.1.5.rst @@ -19,15 +19,10 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.loc` and :meth:`Series.loc` for ``__setitem__`` when one-dimensional tuple was given to select from :class:`MultiIndex` (:issue:`37711`) - Fixed regression in inplace operations on :class:`Series` with ``ExtensionDtype`` with NumPy dtyped operand (:issue:`37910`) - Fixed regression in metadata propagation for ``groupby`` iterator (:issue:`37343`) -- Fixed regression in :class:`MultiIndex` constructed from a :class:`DatetimeIndex` not retaining frequency (:issue:`35563`) -- Fixed regression in :meth:`DataFrame.unstack` with columns with integer dtype (:issue:`37115`) - Fixed regression in indexing on a :class:`Series` with ``CategoricalDtype`` after unpickling (:issue:`37631`) - Fixed regression in :meth:`DataFrame.groupby` aggregation with out-of-bounds datetime objects in an object-dtype column (:issue:`36003`) - Fixed regression in ``df.groupby(..).rolling(..)`` with the resulting :class:`MultiIndex` when grouping by a label that is in the index (:issue:`37641`) - Fixed regression in :meth:`DataFrame.fillna` not filling ``NaN`` after other operations such as :meth:`DataFrame.pivot` (:issue:`36495`). -- Fixed performance regression in ``df.groupby(..).rolling(..)`` (:issue:`38038`) -- Fixed regression in :meth:`MultiIndex.intersection` returning duplicates when at least one of the indexes had duplicates (:issue:`36915`) -- Fixed regression in :meth:`.GroupBy.first` and :meth:`.GroupBy.last` where ``None`` was considered a non-NA value (:issue:`38286`) .. --------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 3bd3f1821f525..d45813960d5c2 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -8,15 +8,6 @@ including other versions of pandas. {{ header }} -.. warning:: - - Previously, the default argument ``engine=None`` to ``pd.read_excel`` - would result in using the `xlrd `_ engine in - many cases. The engine ``xlrd`` is no longer maintained, and is not supported with - python >= 3.9. If `openpyxl `_ is installed, - many of these cases will now default to using the ``openpyxl`` engine. See the - :func:`read_excel` documentation for more details. - .. --------------------------------------------------------------------------- Enhancements @@ -33,45 +24,27 @@ prevent accidental introduction of duplicate labels, which can affect downstream By default, duplicates continue to be allowed. -.. code-block:: ipython - - In [1]: pd.Series([1, 2], index=['a', 'a']) - Out[1]: - a 1 - a 2 - Length: 2, dtype: int64 - - In [2]: pd.Series([1, 2], index=['a', 'a']).set_flags(allows_duplicate_labels=False) - ... - DuplicateLabelError: Index has duplicates. - positions - label - a [0, 1] +.. ipython:: python -pandas will propagate the ``allows_duplicate_labels`` property through many operations. + pd.Series([1, 2], index=['a', 'a']) -.. code-block:: ipython +.. 
ipython:: python + :okexcept: - In [3]: a = ( - ...: pd.Series([1, 2], index=['a', 'b']) - ...: .set_flags(allows_duplicate_labels=False) - ...: ) + pd.Series([1, 2], index=['a', 'a']).set_flags(allows_duplicate_labels=False) - In [4]: a - Out[4]: - a 1 - b 2 - Length: 2, dtype: int64 +pandas will propagate the ``allows_duplicate_labels`` property through many operations. - # An operation introducing duplicates - In [5]: a.reindex(['a', 'b', 'a']) - ... - DuplicateLabelError: Index has duplicates. - positions - label - a [0, 2] +.. ipython:: python + :okexcept: - [1 rows x 1 columns] + a = ( + pd.Series([1, 2], index=['a', 'b']) + .set_flags(allows_duplicate_labels=False) + ) + a + # An operation introducing duplicates + a.reindex(['a', 'b', 'a']) .. warning:: @@ -205,9 +178,6 @@ Alternatively, you can also use the dtype object: pd.Series([1.5, None], dtype=pd.Float32Dtype()) -Operations with the existing integer or boolean nullable data types that -give float results will now also use the nullable floating data types (:issue:`38178`). - .. warning:: Experimental: the new floating data types are currently experimental, and their @@ -303,10 +273,6 @@ Other enhancements - Calling a binary-input NumPy ufunc on multiple ``DataFrame`` objects now aligns, matching the behavior of binary operations and ufuncs on ``Series`` (:issue:`23743`). - Where possible :meth:`RangeIndex.difference` and :meth:`RangeIndex.symmetric_difference` will return :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`36564`) - :meth:`DataFrame.to_parquet` now supports :class:`MultiIndex` for columns in parquet format (:issue:`34777`) -- :func:`read_parquet` gained a ``use_nullable_dtypes=True`` option to use - nullable dtypes that use ``pd.NA`` as missing value indicator where possible - for the resulting DataFrame (default is False, and only applicable for - ``engine="pyarrow"``) (:issue:`31242`) - Added :meth:`.Rolling.sem` and :meth:`Expanding.sem` to compute the standard error of the mean (:issue:`26476`) - :meth:`.Rolling.var` and :meth:`.Rolling.std` use Kahan summation and Welford's Method to avoid numerical issues (:issue:`37051`) - :meth:`DataFrame.corr` and :meth:`DataFrame.cov` use Welford's Method to avoid numerical issues (:issue:`37448`) @@ -323,7 +289,6 @@ Other enhancements - Improve numerical stability for :meth:`.Rolling.skew`, :meth:`.Rolling.kurt`, :meth:`Expanding.skew` and :meth:`Expanding.kurt` through implementation of Kahan summation (:issue:`6929`) - Improved error reporting for subsetting columns of a :class:`.DataFrameGroupBy` with ``axis=1`` (:issue:`37725`) - Implement method ``cross`` for :meth:`DataFrame.merge` and :meth:`DataFrame.join` (:issue:`5401`) -- When :func:`read_csv/sas/json` are called with ``chuncksize``/``iterator`` they can be used in a ``with`` statement as they return context-managers (:issue:`38225`) .. 
--------------------------------------------------------------------------- @@ -521,7 +486,6 @@ Other API changes - Passing an invalid ``fill_value`` to :meth:`Series.shift` with a ``CategoricalDtype`` now raises a ``TypeError`` instead of a ``ValueError`` (:issue:`37733`) - Passing an invalid value to :meth:`IntervalIndex.insert` or :meth:`CategoricalIndex.insert` now raises a ``TypeError`` instead of a ``ValueError`` (:issue:`37733`) - Attempting to reindex a Series with a :class:`CategoricalIndex` with an invalid ``fill_value`` now raises a ``TypeError`` instead of a ``ValueError`` (:issue:`37733`) -- :meth:`CategoricalIndex.append` with an index that contains non-category values will now cast instead of raising ``TypeError`` (:issue:`38098`) .. --------------------------------------------------------------------------- @@ -581,7 +545,6 @@ Performance improvements - Performance improvement in :meth:`Series.astype` and :meth:`DataFrame.astype` for :class:`Categorical` (:issue:`8628`) - Performance improvement in :meth:`DataFrame.groupby` for ``float`` ``dtype`` (:issue:`28303`), changes of the underlying hash-function can lead to changes in float based indexes sort ordering for ties (e.g. :meth:`Index.value_counts`) - Performance improvement in :meth:`pd.isin` for inputs with more than 1e6 elements (:issue:`36611`) -- Performance improvement for :meth:`DataFrame.__setitem__` with list-like indexers (:issue:`37954`) .. --------------------------------------------------------------------------- @@ -596,11 +559,10 @@ Categorical - Bug in :meth:`Categorical.__setitem__` that incorrectly raised when trying to set a tuple value (:issue:`20439`) - Bug in :meth:`CategoricalIndex.equals` incorrectly casting non-category entries to ``np.nan`` (:issue:`37667`) - Bug in :meth:`CategoricalIndex.where` incorrectly setting non-category entries to ``np.nan`` instead of raising ``TypeError`` (:issue:`37977`) -- Bug in :meth:`Categorical.to_numpy` and ``np.array(categorical)`` with timezone-aware ``datetime64`` categories incorrectly dropping the timezone information instead of casting to object dtype (:issue:`38136`) +- Datetimelike ^^^^^^^^^^^^ -- Bug in :meth:`DataFrame.combine_first` that would convert datetime-like column on other :class:`DataFrame` to integer when the column is not present in original :class:`DataFrame` (:issue:`28481`) - Bug in :attr:`.DatetimeArray.date` where a ``ValueError`` would be raised with a read-only backing array (:issue:`33530`) - Bug in ``NaT`` comparisons failing to raise ``TypeError`` on invalid inequality comparisons (:issue:`35046`) - Bug in :class:`.DateOffset` where attributes reconstructed from pickle files differ from original objects when input values exceed normal ranges (e.g months=12) (:issue:`34511`) @@ -621,7 +583,6 @@ Datetimelike - Bug in :meth:`Series.isin` with ``datetime64[ns]`` dtype and :meth:`.DatetimeIndex.isin` incorrectly casting integers to datetimes (:issue:`36621`) - Bug in :meth:`Series.isin` with ``datetime64[ns]`` dtype and :meth:`.DatetimeIndex.isin` failing to consider timezone-aware and timezone-naive datetimes as always different (:issue:`35728`) - Bug in :meth:`Series.isin` with ``PeriodDtype`` dtype and :meth:`PeriodIndex.isin` failing to consider arguments with different ``PeriodDtype`` as always different (:issue:`37528`) -- Bug in :class:`Period` constructor now correctly handles nanoseconds in the ``value`` argument (:issue:`34621` and :issue:`17053`) Timedelta ^^^^^^^^^ @@ -654,13 +615,11 @@ Numeric - Bug in :class:`DataFrame` 
allowing arithmetic operations with list of array-likes with undefined results. Behavior changed to raising ``ValueError`` (:issue:`36702`) - Bug in :meth:`DataFrame.std` with ``timedelta64`` dtype and ``skipna=False`` (:issue:`37392`) - Bug in :meth:`DataFrame.min` and :meth:`DataFrame.max` with ``datetime64`` dtype and ``skipna=False`` (:issue:`36907`) -- Bug in :meth:`DataFrame.idxmax` and :meth:`DataFrame.idxmin` with mixed dtypes incorrectly raising ``TypeError`` (:issue:`38195`) Conversion ^^^^^^^^^^ - Bug in :meth:`DataFrame.to_dict` with ``orient='records'`` now returns python native datetime objects for datetimelike columns (:issue:`21256`) -- Bug in :meth:`Series.astype` conversion from ``string`` to ``float`` raised in presence of ``pd.NA`` values (:issue:`37626`) - Strings @@ -692,7 +651,6 @@ Indexing - Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`MultiIndex` and a level named ``"0"`` (:issue:`37194`) - Bug in :meth:`Series.__getitem__` when using an unsigned integer array as an indexer giving incorrect results or segfaulting instead of raising ``KeyError`` (:issue:`37218`) - Bug in :meth:`Index.where` incorrectly casting numeric values to strings (:issue:`37591`) -- Bug in :meth:`DataFrame.loc` returning empty result when indexer is a slice with negative step size (:issue:`38071`) - Bug in :meth:`Series.loc` and :meth:`DataFrame.loc` raises when the index was of ``object`` dtype and the given numeric label was in the index (:issue:`26491`) - Bug in :meth:`DataFrame.loc` returned requested key plus missing values when ``loc`` was applied to single level from a :class:`MultiIndex` (:issue:`27104`) - Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`CategoricalIndex` using a listlike indexer containing NA values (:issue:`37722`) @@ -705,20 +663,15 @@ Indexing - Bug in :meth:`MultiIndex.drop` does not raise if labels are partially found (:issue:`37820`) - Bug in :meth:`DataFrame.loc` did not raise ``KeyError`` when missing combination was given with ``slice(None)`` for remaining levels (:issue:`19556`) - Bug in :meth:`DataFrame.loc` raising ``TypeError`` when non-integer slice was given to select values from :class:`MultiIndex` (:issue:`25165`, :issue:`24263`) -- Bug in :meth:`Series.at` returning :class:`Series` with one element instead of scalar when index is a :class:`MultiIndex` with one level (:issue:`38053`) - Bug in :meth:`DataFrame.loc` returning and assigning elements in wrong order when indexer is differently ordered than the :class:`MultiIndex` to filter (:issue:`31330`, :issue:`34603`) - Bug in :meth:`DataFrame.loc` and :meth:`DataFrame.__getitem__` raising ``KeyError`` when columns were :class:`MultiIndex` with only one level (:issue:`29749`) - Bug in :meth:`Series.__getitem__` and :meth:`DataFrame.__getitem__` raising blank ``KeyError`` without missing keys for :class:`IntervalIndex` (:issue:`27365`) -- Bug in setting a new label on a :class:`DataFrame` or :class:`Series` with a :class:`CategoricalIndex` incorrectly raising ``TypeError`` when the new label is not among the index's categories (:issue:`38098`) -- Bug in :meth:`Series.loc` and :meth:`Series.iloc` raising ``ValueError`` when inserting a listlike ``np.array``, ``list`` or ``tuple`` in an ``object`` Series of equal length (:issue:`37748`, :issue:`37486`) -- Bug in :meth:`Series.loc` and :meth:`Series.iloc` setting all the values of an ``object`` Series with those of a listlike ``ExtensionArray`` instead of inserting it (:issue:`38271`) Missing ^^^^^^^ - 
Bug in :meth:`.SeriesGroupBy.transform` now correctly handles missing values for ``dropna=False`` (:issue:`35014`) - Bug in :meth:`Series.nunique` with ``dropna=True`` was returning incorrect results when both ``NA`` and ``None`` missing values were present (:issue:`37566`) -- Bug in :meth:`Series.interpolate` where kwarg ``limit_area`` and ``limit_direction`` had no effect when using methods ``pad`` and ``backfill`` (:issue:`31048`) - MultiIndex @@ -728,7 +681,6 @@ MultiIndex - Bug in :meth:`DataFrame.reset_index` with ``NaT`` values in index raises ``ValueError`` with message ``"cannot convert float NaN to integer"`` (:issue:`36541`) - Bug in :meth:`DataFrame.combine_first` when used with :class:`MultiIndex` containing string and ``NaN`` values raises ``TypeError`` (:issue:`36562`) - Bug in :meth:`MultiIndex.drop` dropped ``NaN`` values when non existing key was given as input (:issue:`18853`) -- Bug in :meth:`MultiIndex.drop` dropping more values than expected when index has duplicates and is not sorted (:issue:`33494`) I/O ^^^ @@ -806,14 +758,10 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.groupby` dropped ``nan`` groups from result with ``dropna=False`` when grouping over a single column (:issue:`35646`, :issue:`35542`) - Bug in :meth:`.DataFrameGroupBy.head`, :meth:`.DataFrameGroupBy.tail`, :meth:`SeriesGroupBy.head`, and :meth:`SeriesGroupBy.tail` would raise when used with ``axis=1`` (:issue:`9772`) - Bug in :meth:`.DataFrameGroupBy.transform` would raise when used with ``axis=1`` and a transformation kernel (e.g. "shift") (:issue:`36308`) -- Bug in :meth:`.DataFrameGroupBy.apply` dropped values on ``nan`` group when returning the same axes with the original frame (:issue:`38227`) -- Bug in :meth:`.DataFrameGroupBy.quantile` couldn't handle with arraylike ``q`` when grouping by columns (:issue:`33795`) -- Bug in :meth:`DataFrameGroupBy.rank` with ``datetime64tz`` or period dtype incorrectly casting results to those dtypes instead of returning ``float64`` dtype (:issue:`38187`) Reshaping ^^^^^^^^^ -- Bug in :meth:`DataFrame.crosstab` was returning incorrect results on inputs with duplicate row names, duplicate column names or duplicate names between row and column labels (:issue:`22529`) - Bug in :meth:`DataFrame.pivot_table` with ``aggfunc='count'`` or ``aggfunc='sum'`` returning ``NaN`` for missing categories when pivoted on a ``Categorical``. 
Now returning ``0`` (:issue:`31422`) - Bug in :func:`concat` and :class:`DataFrame` constructor where input index names are not preserved in some cases (:issue:`13475`) - Bug in func :meth:`crosstab` when using multiple columns with ``margins=True`` and ``normalize=True`` (:issue:`35144`) @@ -829,10 +777,6 @@ Reshaping - Bug in :meth:`DataFrame.apply` not setting index of return value when ``func`` return type is ``dict`` (:issue:`37544`) - Bug in :func:`concat` resulting in a ``ValueError`` when at least one of both inputs had a non-unique index (:issue:`36263`) - Bug in :meth:`DataFrame.merge` and :meth:`pandas.merge` returning inconsistent ordering in result for ``how=right`` and ``how=left`` (:issue:`35382`) -- Bug in :func:`merge_ordered` couldn't handle list-like ``left_by`` or ``right_by`` (:issue:`35269`) -- Bug in :func:`merge_ordered` returned wrong join result when length of ``left_by`` or ``right_by`` equals to the rows of ``left`` or ``right`` (:issue:`38166`) -- Bug in :func:`merge_ordered` didn't raise when elements in ``left_by`` or ``right_by`` not exist in ``left`` columns or ``right`` columns (:issue:`38167`) -- Bug in :func:`DataFrame.drop_duplicates` not validating bool dtype for ``ignore_index`` keyword (:issue:`38274`) Sparse ^^^^^^ @@ -848,7 +792,6 @@ ExtensionArray - Fixed bug when applying a NumPy ufunc with multiple outputs to an :class:`.IntegerArray` returning None (:issue:`36913`) - Fixed an inconsistency in :class:`.PeriodArray`'s ``__init__`` signature to those of :class:`.DatetimeArray` and :class:`.TimedeltaArray` (:issue:`37289`) - Reductions for :class:`.BooleanArray`, :class:`.Categorical`, :class:`.DatetimeArray`, :class:`.FloatingArray`, :class:`.IntegerArray`, :class:`.PeriodArray`, :class:`.TimedeltaArray`, and :class:`.PandasArray` are now keyword-only methods (:issue:`37541`) -- Fixed a bug where a ``TypeError`` was wrongly raised if a membership check was made on an ``ExtensionArray`` containing nan-like values (:issue:`37867`) Other ^^^^^ @@ -860,19 +803,10 @@ Other - Fixed bug in metadata propagation incorrectly copying DataFrame columns as metadata when the column name overlaps with the metadata name (:issue:`37037`) - Fixed metadata propagation in the :class:`Series.dt`, :class:`Series.str` accessors, :class:`DataFrame.duplicated`, :class:`DataFrame.stack`, :class:`DataFrame.unstack`, :class:`DataFrame.pivot`, :class:`DataFrame.append`, :class:`DataFrame.diff`, :class:`DataFrame.applymap` and :class:`DataFrame.update` methods (:issue:`28283`, :issue:`37381`) - Fixed metadata propagation when selecting columns with ``DataFrame.__getitem__`` (:issue:`28283`) -- Bug in :meth:`Index.intersection` with non-:class:`Index` failing to set the correct name on the returned :class:`Index` (:issue:`38111`) -- Bug in :meth:`RangeIndex.intersection` failing to set the correct name on the returned :class:`Index` in some corner cases (:issue:`38197`) -- Bug in :meth:`Index.difference` failing to set the correct name on the returned :class:`Index` in some corner cases (:issue:`38268`) - Bug in :meth:`Index.union` behaving differently depending on whether operand is an :class:`Index` or other list-like (:issue:`36384`) -- Bug in :meth:`Index.intersection` with non-matching numeric dtypes casting to ``object`` dtype instead of minimal common dtype (:issue:`38122`) -- Bug in :meth:`IntervalIndex.intersection` returning an incorrectly-typed :class:`Index` when empty (:issue:`38282`) - Passing an array with 2 or more dimensions to the :class:`Series` constructor 
now raises the more specific ``ValueError`` rather than a bare ``Exception`` (:issue:`35744`) - Bug in ``dir`` where ``dir(obj)`` wouldn't show attributes defined on the instance for pandas objects (:issue:`37173`) -- Bug in :meth:`Index.drop` raising ``InvalidIndexError`` when index has duplicates (:issue:`38051`) - Bug in :meth:`RangeIndex.difference` returning :class:`Int64Index` in some cases where it should return :class:`RangeIndex` (:issue:`38028`) -- Fixed bug in :func:`assert_series_equal` when comparing a datetime-like array with an equivalent non extension dtype array (:issue:`37609`) - - .. --------------------------------------------------------------------------- @@ -880,5 +814,3 @@ Other Contributors ~~~~~~~~~~~~ - -.. contributors:: v1.1.4..v1.2.0|HEAD diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 5c4ba3b2729e3..24156c88f0d76 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -928,7 +928,9 @@ def group_last(rank_t[:, :] out, for j in range(K): val = values[i, j] - if not checknull(val): + # None should not be treated like other NA-like + # so that it won't be converted to nan + if not checknull(val) or val is None: # NB: use _treat_as_na here once # conditional-nogil is available. nobs[lab, j] += 1 @@ -937,7 +939,7 @@ def group_last(rank_t[:, :] out, for i in range(ncounts): for j in range(K): if nobs[i, j] < min_count: - out[i, j] = None + out[i, j] = NAN else: out[i, j] = resx[i, j] else: @@ -1021,7 +1023,9 @@ def group_nth(rank_t[:, :] out, for j in range(K): val = values[i, j] - if not checknull(val): + # None should not be treated like other NA-like + # so that it won't be converted to nan + if not checknull(val) or val is None: # NB: use _treat_as_na here once # conditional-nogil is available. 
nobs[lab, j] += 1 @@ -1031,7 +1035,7 @@ def group_nth(rank_t[:, :] out, for i in range(ncounts): for j in range(K): if nobs[i, j] < min_count: - out[i, j] = None + out[i, j] = NAN else: out[i, j] = resx[i, j] diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 4b6b71088cb7c..ad6329c588bbe 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -367,9 +367,9 @@ def apply_frame_axis0(object frame, object f, object names, try: piece = f(chunk) - except Exception as err: + except Exception: # We can't be more specific without knowing something about `f` - raise InvalidApply("Let this error raise above us") from err + raise InvalidApply('Let this error raise above us') # Need to infer if low level index slider will cause segfaults require_slow_apply = i == 0 and piece is chunk diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index cbd4e2e6704a9..d83138528a6f9 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -2345,7 +2345,6 @@ class Period(_Period): if freq is not None: freq = cls._maybe_convert_freq(freq) - nanosecond = 0 if ordinal is not None and value is not None: raise ValueError("Only value or ordinal but not both should be " @@ -2395,14 +2394,6 @@ class Period(_Period): value = str(value) value = value.upper() dt, reso = parse_time_string(value, freq) - try: - ts = Timestamp(value) - except ValueError: - nanosecond = 0 - else: - nanosecond = ts.nanosecond - if nanosecond != 0: - reso = 'nanosecond' if dt is NaT: ordinal = NPY_NAT @@ -2434,7 +2425,7 @@ class Period(_Period): base = freq_to_dtype_code(freq) ordinal = period_ordinal(dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second, - dt.microsecond, 1000*nanosecond, base) + dt.microsecond, 0, base) return cls._from_ordinal(ordinal, freq) diff --git a/pandas/_testing.py b/pandas/_testing.py index 469f5e1bed6ba..68371b782aac2 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -1456,16 +1456,7 @@ def assert_series_equal( check_dtype=check_dtype, index_values=np.asarray(left.index), ) - elif is_extension_array_dtype_and_needs_i8_conversion( - left.dtype, right.dtype - ) or is_extension_array_dtype_and_needs_i8_conversion(right.dtype, left.dtype): - assert_extension_array_equal( - left._values, - right._values, - check_dtype=check_dtype, - index_values=np.asarray(left.index), - ) - elif needs_i8_conversion(left.dtype) and needs_i8_conversion(right.dtype): + elif needs_i8_conversion(left.dtype) or needs_i8_conversion(right.dtype): # DatetimeArray or TimedeltaArray assert_extension_array_equal( left._values, @@ -1875,20 +1866,6 @@ def assert_copy(iter1, iter2, **eql_kwargs): assert elem1 is not elem2, msg -def is_extension_array_dtype_and_needs_i8_conversion(left_dtype, right_dtype) -> bool: - """ - Checks that we have the combination of an ExtensionArraydtype and - a dtype that should be converted to int64 - - Returns - ------- - bool - - Related to issue #37609 - """ - return is_extension_array_dtype(left_dtype) and needs_i8_conversion(right_dtype) - - def getCols(k): return string.ascii_uppercase[:k] @@ -2190,15 +2167,15 @@ def makeCustomIndex( names = [names] # specific 1D index type requested? 
- idx_func = { - "i": makeIntIndex, - "f": makeFloatIndex, - "s": makeStringIndex, - "u": makeUnicodeIndex, - "dt": makeDateIndex, - "td": makeTimedeltaIndex, - "p": makePeriodIndex, - }.get(idx_type) + idx_func = dict( + i=makeIntIndex, + f=makeFloatIndex, + s=makeStringIndex, + u=makeUnicodeIndex, + dt=makeDateIndex, + td=makeTimedeltaIndex, + p=makePeriodIndex, + ).get(idx_type) if idx_func: # pandas\_testing.py:2120: error: Cannot call function of unknown type idx = idx_func(nentries) # type: ignore[operator] diff --git a/pandas/_typing.py b/pandas/_typing.py index 09c490e64957d..7f01bcaa1c50e 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -1,7 +1,7 @@ from datetime import datetime, timedelta, tzinfo from io import BufferedIOBase, RawIOBase, TextIOBase, TextIOWrapper from mmap import mmap -from os import PathLike +from pathlib import Path from typing import ( IO, TYPE_CHECKING, @@ -135,7 +135,7 @@ # filenames and file-like-objects Buffer = Union[IO[AnyStr], RawIOBase, BufferedIOBase, TextIOBase, TextIOWrapper, mmap] FileOrBuffer = Union[str, Buffer[T]] -FilePathOrBuffer = Union["PathLike[str]", FileOrBuffer[T]] +FilePathOrBuffer = Union[Path, FileOrBuffer[T]] # for arbitrary kwargs passed during reading/writing files StorageOptions = Optional[Dict[str, Any]] diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py index c47c31fabeb70..c2e91c7877d35 100644 --- a/pandas/compat/numpy/function.py +++ b/pandas/compat/numpy/function.py @@ -71,7 +71,7 @@ def __call__( raise ValueError(f"invalid validation method '{method}'") -ARGMINMAX_DEFAULTS = {"out": None} +ARGMINMAX_DEFAULTS = dict(out=None) validate_argmin = CompatValidator( ARGMINMAX_DEFAULTS, fname="argmin", method="both", max_fname_arg_count=1 ) @@ -151,7 +151,7 @@ def validate_argsort_with_ascending(ascending, args, kwargs): return ascending -CLIP_DEFAULTS: Dict[str, Any] = {"out": None} +CLIP_DEFAULTS: Dict[str, Any] = dict(out=None) validate_clip = CompatValidator( CLIP_DEFAULTS, fname="clip", method="both", max_fname_arg_count=3 ) @@ -208,10 +208,10 @@ def validate_cum_func_with_skipna(skipna, args, kwargs, name): ALLANY_DEFAULTS, fname="any", method="both", max_fname_arg_count=1 ) -LOGICAL_FUNC_DEFAULTS = {"out": None, "keepdims": False} +LOGICAL_FUNC_DEFAULTS = dict(out=None, keepdims=False) validate_logical_func = CompatValidator(LOGICAL_FUNC_DEFAULTS, method="kwargs") -MINMAX_DEFAULTS = {"axis": None, "out": None, "keepdims": False} +MINMAX_DEFAULTS = dict(axis=None, out=None, keepdims=False) validate_min = CompatValidator( MINMAX_DEFAULTS, fname="min", method="both", max_fname_arg_count=1 ) @@ -219,17 +219,17 @@ def validate_cum_func_with_skipna(skipna, args, kwargs, name): MINMAX_DEFAULTS, fname="max", method="both", max_fname_arg_count=1 ) -RESHAPE_DEFAULTS: Dict[str, str] = {"order": "C"} +RESHAPE_DEFAULTS: Dict[str, str] = dict(order="C") validate_reshape = CompatValidator( RESHAPE_DEFAULTS, fname="reshape", method="both", max_fname_arg_count=1 ) -REPEAT_DEFAULTS: Dict[str, Any] = {"axis": None} +REPEAT_DEFAULTS: Dict[str, Any] = dict(axis=None) validate_repeat = CompatValidator( REPEAT_DEFAULTS, fname="repeat", method="both", max_fname_arg_count=1 ) -ROUND_DEFAULTS: Dict[str, Any] = {"out": None} +ROUND_DEFAULTS: Dict[str, Any] = dict(out=None) validate_round = CompatValidator( ROUND_DEFAULTS, fname="round", method="both", max_fname_arg_count=1 ) @@ -300,7 +300,7 @@ def validate_take_with_convert(convert, args, kwargs): return convert -TRANSPOSE_DEFAULTS = {"axes": None} 
+TRANSPOSE_DEFAULTS = dict(axes=None) validate_transpose = CompatValidator( TRANSPOSE_DEFAULTS, fname="transpose", method="both", max_fname_arg_count=0 ) diff --git a/pandas/conftest.py b/pandas/conftest.py index 2bac2ed198789..a0ec6f96042fc 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -288,6 +288,7 @@ def unique_nulls_fixture(request): # Generate cartesian product of unique_nulls_fixture: unique_nulls_fixture2 = unique_nulls_fixture + # ---------------------------------------------------------------- # Classes # ---------------------------------------------------------------- @@ -320,16 +321,6 @@ def index_or_series(request): index_or_series2 = index_or_series -@pytest.fixture( - params=[pd.Index, pd.Series, pd.array], ids=["index", "series", "array"] -) -def index_or_series_or_array(request): - """ - Fixture to parametrize over Index, Series, and ExtensionArray - """ - return request.param - - @pytest.fixture def dict_subclass(): """ @@ -1100,20 +1091,6 @@ def float_ea_dtype(request): return request.param -@pytest.fixture(params=tm.FLOAT_DTYPES + tm.FLOAT_EA_DTYPES) -def any_float_allowed_nullable_dtype(request): - """ - Parameterized fixture for float dtypes. - - * float - * 'float32' - * 'float64' - * 'Float32' - * 'Float64' - """ - return request.param - - @pytest.fixture(params=tm.COMPLEX_DTYPES) def complex_dtype(request): """ @@ -1432,17 +1409,3 @@ def __init__(self, **kwargs): registry.pop("testmem", None) TestMemoryFS.test[0] = None TestMemoryFS.store.clear() - - -@pytest.fixture( - params=[ - ("foo", None, None), - ("Egon", "Venkman", None), - ("NCC1701D", "NCC1701D", "NCC1701D"), - ] -) -def names(request): - """ - A 3-tuple of names, the first two for operands, the last for a result. - """ - return request.param diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 9749297efd004..7bae912a070a9 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -48,13 +48,11 @@ pandas_dtype, ) from pandas.core.dtypes.generic import ( - ABCDatetimeArray, ABCExtensionArray, ABCIndexClass, ABCMultiIndex, ABCRangeIndex, ABCSeries, - ABCTimedeltaArray, ) from pandas.core.dtypes.missing import isna, na_value_for_dtype @@ -170,7 +168,6 @@ def _ensure_data( elif is_categorical_dtype(values.dtype) and ( is_categorical_dtype(dtype) or dtype is None ): - values = cast("Categorical", values) values = values.codes dtype = pandas_dtype("category") @@ -201,16 +198,8 @@ def _reconstruct_data( ------- ExtensionArray or np.ndarray """ - if isinstance(values, ABCExtensionArray) and values.dtype == dtype: - # Catch DatetimeArray/TimedeltaArray - return values - if is_extension_array_dtype(dtype): - cls = dtype.construct_array_type() - if isinstance(values, cls) and values.dtype == dtype: - return values - - values = cls._from_sequence(values) + values = dtype.construct_array_type()._from_sequence(values) elif is_bool_dtype(dtype): values = values.astype(dtype, copy=False) @@ -445,8 +434,6 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: elif isinstance(values, ABCMultiIndex): # Avoid raising in extract_array values = np.array(values) - else: - values = extract_array(values, extract_numpy=True) comps = _ensure_arraylike(comps) comps = extract_array(comps, extract_numpy=True) @@ -461,14 +448,11 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: elif needs_i8_conversion(values.dtype) and not is_object_dtype(comps.dtype): # e.g. 
comps are integers and values are datetime64s return np.zeros(comps.shape, dtype=bool) - # TODO: not quite right ... Sparse/Categorical - elif needs_i8_conversion(values.dtype): - return isin(comps, values.astype(object)) - elif is_extension_array_dtype(comps.dtype) or is_extension_array_dtype( - values.dtype - ): - return isin(np.asarray(comps), np.asarray(values)) + comps, dtype = _ensure_data(comps) + values, _ = _ensure_data(values, dtype=dtype) + + f = htable.ismember_object # GH16012 # Ensure np.in1d doesn't get object types or it *may* throw an exception @@ -481,15 +465,23 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: f = lambda c, v: np.logical_or(np.in1d(c, v), np.isnan(c)) else: f = np.in1d - - else: - common = np.find_common_type([values.dtype, comps.dtype], []) - values = values.astype(common, copy=False) - comps = comps.astype(common, copy=False) - name = common.name - if name == "bool": - name = "uint8" - f = getattr(htable, f"ismember_{name}") + elif is_integer_dtype(comps.dtype): + try: + values = values.astype("int64", copy=False) + comps = comps.astype("int64", copy=False) + f = htable.ismember_int64 + except (TypeError, ValueError, OverflowError): + values = values.astype(object) + comps = comps.astype(object) + + elif is_float_dtype(comps.dtype): + try: + values = values.astype("float64", copy=False) + comps = comps.astype("float64", copy=False) + f = htable.ismember_float64 + except (TypeError, ValueError): + values = values.astype(object) + comps = comps.astype(object) return f(comps, values) @@ -681,13 +673,8 @@ def factorize( # responsible only for factorization. All data coercion, sorting and boxing # should happen here. - if isinstance(values, ABCRangeIndex): - return values.factorize(sort=sort) - values = _ensure_arraylike(values) original = values - if not isinstance(values, ABCMultiIndex): - values = extract_array(values, extract_numpy=True) # GH35667, if na_sentinel=None, we will not dropna NaNs from the uniques # of values, assign na_sentinel=-1 to replace code value for NaN. 
@@ -696,20 +683,10 @@ def factorize( na_sentinel = -1 dropna = False - if ( - isinstance(values, (ABCDatetimeArray, ABCTimedeltaArray)) - and values.freq is not None - ): - codes, uniques = values.factorize(sort=sort) - if isinstance(original, ABCIndexClass): - uniques = original._shallow_copy(uniques, name=None) - elif isinstance(original, ABCSeries): - from pandas import Index - - uniques = Index(uniques) - return codes, uniques - - if is_extension_array_dtype(values.dtype): + if isinstance(values, ABCRangeIndex): + return values.factorize(sort=sort) + elif is_extension_array_dtype(values.dtype): + values = extract_array(values) codes, uniques = values.factorize(na_sentinel=na_sentinel) dtype = original.dtype else: diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 6d9e11ecb824f..c5260deafc0c3 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -9,12 +9,7 @@ from pandas._typing import Axis, FrameOrSeriesUnion from pandas.util._decorators import cache_readonly -from pandas.core.dtypes.common import ( - is_dict_like, - is_extension_array_dtype, - is_list_like, - is_sequence, -) +from pandas.core.dtypes.common import is_dict_like, is_list_like, is_sequence from pandas.core.dtypes.generic import ABCSeries from pandas.core.construction import create_series_with_explicit_dtype @@ -397,20 +392,12 @@ def series_generator(self): mgr = ser._mgr blk = mgr.blocks[0] - if is_extension_array_dtype(blk.dtype): - # values will be incorrect for this block - # TODO(EA2D): special case would be unnecessary with 2D EAs - obj = self.obj - for i in range(len(obj)): - yield obj._ixs(i, axis=0) - - else: - for (arr, name) in zip(values, self.index): - # GH#35462 re-pin mgr in case setitem changed it - ser._mgr = mgr - blk.values = arr - ser.name = name - yield ser + for (arr, name) in zip(values, self.index): + # GH#35462 re-pin mgr in case setitem changed it + ser._mgr = mgr + blk.values = arr + ser.name = name + yield ser @property def result_index(self) -> "Index": diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 02214ff51b02a..5cc6525dc3c9b 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -162,7 +162,7 @@ def repeat( -------- numpy.ndarray.repeat """ - nv.validate_repeat((), {"axis": axis}) + nv.validate_repeat(tuple(), dict(axis=axis)) new_data = self._ndarray.repeat(repeats, axis=axis) return self._from_backing_data(new_data) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 77cd603cc6b8d..448025e05422d 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -37,7 +37,6 @@ is_array_like, is_dtype_equal, is_list_like, - is_scalar, pandas_dtype, ) from pandas.core.dtypes.dtypes import ExtensionDtype @@ -49,7 +48,7 @@ from pandas.core.missing import get_fill_func from pandas.core.sorting import nargminmax, nargsort -_extension_array_shared_docs: Dict[str, str] = {} +_extension_array_shared_docs: Dict[str, str] = dict() ExtensionArrayT = TypeVar("ExtensionArrayT", bound="ExtensionArray") @@ -355,23 +354,6 @@ def __iter__(self): for i in range(len(self)): yield self[i] - def __contains__(self, item) -> bool: - """ - Return for `item in self`. - """ - # GH37867 - # comparisons of any item to pd.NA always return pd.NA, so e.g. "a" in [pd.NA] - # would raise a TypeError. The implementation below works around that. 
- if is_scalar(item) and isna(item): - if not self._can_hold_na: - return False - elif item is self.dtype.na_value or isinstance(item, self.dtype.type): - return self.isna().any() - else: - return False - else: - return (item == self).any() - def __eq__(self, other: Any) -> ArrayLike: """ Return for `self == other` (element-wise equality). @@ -952,7 +934,7 @@ def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ExtensionArray]: @Substitution(klass="ExtensionArray") @Appender(_extension_array_shared_docs["repeat"]) def repeat(self, repeats, axis=None): - nv.validate_repeat(tuple(), {"axis": axis}) + nv.validate_repeat(tuple(), dict(axis=axis)) ind = np.arange(len(self)).repeat(repeats) return self.take(ind) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 44cc108ed9cfd..c6c7396a980b0 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -706,11 +706,10 @@ def _maybe_mask_result(self, result, mask, other, op_name: str): if (is_float_dtype(other) or is_float(other)) or ( op_name in ["rtruediv", "truediv"] ): - from pandas.core.arrays import FloatingArray - - return FloatingArray(result, mask, copy=False) + result[mask] = np.nan + return result - elif is_bool_dtype(result): + if is_bool_dtype(result): return BooleanArray(result, mask, copy=False) elif is_integer_dtype(result): diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 3995e7b251184..fe66aae23f510 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1269,13 +1269,15 @@ def __array__(self, dtype=None) -> np.ndarray: if dtype==None (default), the same dtype as categorical.categories.dtype. """ - ret = take_1d(self.categories._values, self._codes) + ret = take_1d(self.categories.values, self._codes) if dtype and not is_dtype_equal(dtype, self.categories.dtype): return np.asarray(ret, dtype) - # When we're a Categorical[ExtensionArray], like Interval, - # we need to ensure __array__ gets all the way to an - # ndarray. - return np.asarray(ret) + if is_extension_array_dtype(ret): + # When we're a Categorical[ExtensionArray], like Interval, + # we need to ensure __array__ get's all the way to an + # ndarray. + ret = np.asarray(ret) + return ret def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): # for binary ops, use our custom dunder methods diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 66906f8463336..8fa2c734092f4 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -936,8 +936,7 @@ def _cmp_method(self, other, op): return result other_vals = self._unbox(other) - # GH#37462 comparison on i8 values is almost 2x faster than M8/m8 - result = op(self._ndarray.view("i8"), other_vals.view("i8")) + result = op(self._ndarray, other_vals) o_mask = isna(other) if self._hasnans | np.any(o_mask): @@ -1646,24 +1645,6 @@ def _with_freq(self, freq): arr._freq = freq return arr - # -------------------------------------------------------------- - - def factorize(self, na_sentinel=-1, sort: bool = False): - if self.freq is not None: - # We must be unique, so can short-circuit (and retain freq) - codes = np.arange(len(self), dtype=np.intp) - uniques = self.copy() # TODO: copy or view? 
- if sort and self.freq.n < 0: - codes = codes[::-1] - # TODO: overload __getitem__, a slice indexer returns same type as self - # error: Incompatible types in assignment (expression has type - # "Union[DatetimeLikeArrayMixin, Union[Any, Any]]", variable - # has type "TimelikeOps") [assignment] - uniques = uniques[::-1] # type: ignore[assignment] - return codes, uniques - # FIXME: shouldn't get here; we are ignoring sort - return super().factorize(na_sentinel=na_sentinel) - # ------------------------------------------------------------------- # Shared Constructor Helpers diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index 1077538f6a21d..4aed39d7edb92 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -13,7 +13,9 @@ from pandas.core.dtypes.common import ( is_bool_dtype, is_datetime64_dtype, + is_float, is_float_dtype, + is_integer, is_integer_dtype, is_list_like, is_object_dtype, @@ -26,8 +28,7 @@ from pandas.core.ops import invalid_comparison from pandas.core.tools.numeric import to_numeric -from .masked import BaseMaskedDtype -from .numeric import NumericArray +from .masked import BaseMaskedArray, BaseMaskedDtype if TYPE_CHECKING: import pyarrow @@ -198,7 +199,7 @@ def coerce_to_array( return values, mask -class FloatingArray(NumericArray): +class FloatingArray(BaseMaskedArray): """ Array of floating (optional missing) values. @@ -385,9 +386,9 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike: # coerce if is_float_dtype(dtype): # In astype, we consider dtype=float to also mean na_value=np.nan - kwargs = {"na_value": np.nan} + kwargs = dict(na_value=np.nan) elif is_datetime64_dtype(dtype): - kwargs = {"na_value": np.datetime64("NaT")} + kwargs = dict(na_value=np.datetime64("NaT")) else: kwargs = {} @@ -477,6 +478,71 @@ def _maybe_mask_result(self, result, mask, other, op_name: str): return type(self)(result, mask, copy=False) + def _arith_method(self, other, op): + from pandas.arrays import IntegerArray + + omask = None + + if getattr(other, "ndim", 0) > 1: + raise NotImplementedError("can only perform ops with 1-d structures") + + if isinstance(other, (IntegerArray, FloatingArray)): + other, omask = other._data, other._mask + + elif is_list_like(other): + other = np.asarray(other) + if other.ndim > 1: + raise NotImplementedError("can only perform ops with 1-d structures") + if len(self) != len(other): + raise ValueError("Lengths must match") + if not (is_float_dtype(other) or is_integer_dtype(other)): + raise TypeError("can only perform ops with numeric values") + + else: + if not (is_float(other) or is_integer(other) or other is libmissing.NA): + raise TypeError("can only perform ops with numeric values") + + if omask is None: + mask = self._mask.copy() + if other is libmissing.NA: + mask |= True + else: + mask = self._mask | omask + + if op.__name__ == "pow": + # 1 ** x is 1. + mask = np.where((self._data == 1) & ~self._mask, False, mask) + # x ** 0 is 1. + if omask is not None: + mask = np.where((other == 0) & ~omask, False, mask) + elif other is not libmissing.NA: + mask = np.where(other == 0, False, mask) + + elif op.__name__ == "rpow": + # 1 ** x is 1. + if omask is not None: + mask = np.where((other == 1) & ~omask, False, mask) + elif other is not libmissing.NA: + mask = np.where(other == 1, False, mask) + # x ** 0 is 1. 
+ mask = np.where((self._data == 0) & ~self._mask, False, mask) + + if other is libmissing.NA: + result = np.ones_like(self._data) + else: + with np.errstate(all="ignore"): + result = op(self._data, other) + + # divmod returns a tuple + if op.__name__ == "divmod": + div, mod = result + return ( + self._maybe_mask_result(div, mask, other, "floordiv"), + self._maybe_mask_result(mod, mask, other, "mod"), + ) + + return self._maybe_mask_result(result, mask, other, op.__name__) + _dtype_docstring = """ An ExtensionDtype for {dtype} data. diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index fa427e94fe08f..2897c18acfb09 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -1,10 +1,11 @@ +from datetime import timedelta import numbers from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Type, Union import warnings import numpy as np -from pandas._libs import iNaT, lib, missing as libmissing +from pandas._libs import Timedelta, iNaT, lib, missing as libmissing from pandas._typing import ArrayLike, DtypeObj from pandas.compat.numpy import function as nv from pandas.util._decorators import cache_readonly @@ -15,6 +16,7 @@ is_datetime64_dtype, is_float, is_float_dtype, + is_integer, is_integer_dtype, is_list_like, is_object_dtype, @@ -27,7 +29,6 @@ from pandas.core.tools.numeric import to_numeric from .masked import BaseMaskedArray, BaseMaskedDtype -from .numeric import NumericArray if TYPE_CHECKING: import pyarrow @@ -262,7 +263,7 @@ def coerce_to_array( return values, mask -class IntegerArray(NumericArray): +class IntegerArray(BaseMaskedArray): """ Array of integer (optional missing) values. @@ -493,7 +494,7 @@ def _values_for_argsort(self) -> np.ndarray: return data def _cmp_method(self, other, op): - from pandas.core.arrays import BooleanArray + from pandas.core.arrays import BaseMaskedArray, BooleanArray mask = None @@ -537,6 +538,73 @@ def _cmp_method(self, other, op): return BooleanArray(result, mask) + def _arith_method(self, other, op): + op_name = op.__name__ + omask = None + + if getattr(other, "ndim", 0) > 1: + raise NotImplementedError("can only perform ops with 1-d structures") + + if isinstance(other, IntegerArray): + other, omask = other._data, other._mask + + elif is_list_like(other): + other = np.asarray(other) + if other.ndim > 1: + raise NotImplementedError("can only perform ops with 1-d structures") + if len(self) != len(other): + raise ValueError("Lengths must match") + if not (is_float_dtype(other) or is_integer_dtype(other)): + raise TypeError("can only perform ops with numeric values") + + elif isinstance(other, (timedelta, np.timedelta64)): + other = Timedelta(other) + + else: + if not (is_float(other) or is_integer(other) or other is libmissing.NA): + raise TypeError("can only perform ops with numeric values") + + if omask is None: + mask = self._mask.copy() + if other is libmissing.NA: + mask |= True + else: + mask = self._mask | omask + + if op_name == "pow": + # 1 ** x is 1. + mask = np.where((self._data == 1) & ~self._mask, False, mask) + # x ** 0 is 1. + if omask is not None: + mask = np.where((other == 0) & ~omask, False, mask) + elif other is not libmissing.NA: + mask = np.where(other == 0, False, mask) + + elif op_name == "rpow": + # 1 ** x is 1. + if omask is not None: + mask = np.where((other == 1) & ~omask, False, mask) + elif other is not libmissing.NA: + mask = np.where(other == 1, False, mask) + # x ** 0 is 1. 
+ mask = np.where((self._data == 0) & ~self._mask, False, mask) + + if other is libmissing.NA: + result = np.ones_like(self._data) + else: + with np.errstate(all="ignore"): + result = op(self._data, other) + + # divmod returns a tuple + if op_name == "divmod": + div, mod = result + return ( + self._maybe_mask_result(div, mask, other, "floordiv"), + self._maybe_mask_result(mod, mask, other, "mod"), + ) + + return self._maybe_mask_result(result, mask, other, op_name) + def sum(self, *, skipna=True, min_count=0, **kwargs): nv.validate_sum((), kwargs) return super()._reduce("sum", skipna=skipna, min_count=min_count) @@ -568,9 +636,8 @@ def _maybe_mask_result(self, result, mask, other, op_name: str): if (is_float_dtype(other) or is_float(other)) or ( op_name in ["rtruediv", "truediv"] ): - from pandas.core.arrays import FloatingArray - - return FloatingArray(result, mask, copy=False) + result[mask] = np.nan + return result if result.dtype == "timedelta64[ns]": from pandas.core.arrays import TimedeltaArray diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 53a98fc43becc..efb66c9a47a97 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -44,11 +44,7 @@ from pandas.core.arrays.base import ExtensionArray, _extension_array_shared_docs from pandas.core.arrays.categorical import Categorical import pandas.core.common as com -from pandas.core.construction import ( - array, - ensure_wrapped_if_datetimelike, - extract_array, -) +from pandas.core.construction import array, extract_array from pandas.core.indexers import check_array_indexer from pandas.core.indexes.base import ensure_index from pandas.core.ops import invalid_comparison, unpack_zerodim_and_defer @@ -57,11 +53,9 @@ _interval_shared_docs = {} -_shared_docs_kwargs = { - "klass": "IntervalArray", - "qualname": "arrays.IntervalArray", - "name": "", -} +_shared_docs_kwargs = dict( + klass="IntervalArray", qualname="arrays.IntervalArray", name="" +) _interval_shared_docs[ @@ -129,14 +123,14 @@ @Appender( _interval_shared_docs["class"] - % { - "klass": "IntervalArray", - "summary": "Pandas array for interval data that are closed on the same side.", - "versionadded": "0.24.0", - "name": "", - "extra_attributes": "", - "extra_methods": "", - "examples": textwrap.dedent( + % dict( + klass="IntervalArray", + summary="Pandas array for interval data that are closed on the same side.", + versionadded="0.24.0", + name="", + extra_attributes="", + extra_methods="", + examples=textwrap.dedent( """\ Examples -------- @@ -153,7 +147,7 @@ :meth:`IntervalArray.from_breaks`, and :meth:`IntervalArray.from_tuples`. 
""" ), - } + ) ) class IntervalArray(IntervalMixin, ExtensionArray): ndim = 1 @@ -257,9 +251,11 @@ def _simple_new( raise ValueError(msg) # For dt64/td64 we want DatetimeArray/TimedeltaArray instead of ndarray - left = ensure_wrapped_if_datetimelike(left) + from pandas.core.ops.array_ops import maybe_upcast_datetimelike_array + + left = maybe_upcast_datetimelike_array(left) left = extract_array(left, extract_numpy=True) - right = ensure_wrapped_if_datetimelike(right) + right = maybe_upcast_datetimelike_array(right) right = extract_array(right, extract_numpy=True) lbase = getattr(left, "_ndarray", left).base @@ -321,9 +317,9 @@ def _from_factorized(cls, values, original): @classmethod @Appender( _interval_shared_docs["from_breaks"] - % { - "klass": "IntervalArray", - "examples": textwrap.dedent( + % dict( + klass="IntervalArray", + examples=textwrap.dedent( """\ Examples -------- @@ -333,7 +329,7 @@ def _from_factorized(cls, values, original): Length: 3, closed: right, dtype: interval[int64] """ ), - } + ) ) def from_breaks(cls, breaks, closed="right", copy=False, dtype=None): breaks = maybe_convert_platform_interval(breaks) @@ -392,9 +388,9 @@ def from_breaks(cls, breaks, closed="right", copy=False, dtype=None): @classmethod @Appender( _interval_shared_docs["from_arrays"] - % { - "klass": "IntervalArray", - "examples": textwrap.dedent( + % dict( + klass="IntervalArray", + examples=textwrap.dedent( """\ >>> pd.arrays.IntervalArray.from_arrays([0, 1, 2], [1, 2, 3]) @@ -402,7 +398,7 @@ def from_breaks(cls, breaks, closed="right", copy=False, dtype=None): Length: 3, closed: right, dtype: interval[int64] """ ), - } + ) ) def from_arrays(cls, left, right, closed="right", copy=False, dtype=None): left = maybe_convert_platform_interval(left) @@ -447,9 +443,9 @@ def from_arrays(cls, left, right, closed="right", copy=False, dtype=None): @classmethod @Appender( _interval_shared_docs["from_tuples"] - % { - "klass": "IntervalArray", - "examples": textwrap.dedent( + % dict( + klass="IntervalArray", + examples=textwrap.dedent( """\ Examples -------- @@ -459,7 +455,7 @@ def from_arrays(cls, left, right, closed="right", copy=False, dtype=None): Length: 2, closed: right, dtype: interval[int64] """ ), - } + ) ) def from_tuples(cls, data, closed="right", copy=False, dtype=None): if len(data): @@ -906,7 +902,7 @@ def take(self, indices, *, allow_fill=False, fill_value=None, axis=None, **kwarg When `indices` contains negative values other than ``-1`` and `allow_fill` is True. 
""" - nv.validate_take((), kwargs) + nv.validate_take(tuple(), kwargs) fill_left = fill_right = fill_value if allow_fill: @@ -1146,9 +1142,9 @@ def mid(self): @Appender( _interval_shared_docs["overlaps"] - % { - "klass": "IntervalArray", - "examples": textwrap.dedent( + % dict( + klass="IntervalArray", + examples=textwrap.dedent( """\ >>> data = [(0, 1), (1, 3), (2, 4)] >>> intervals = pd.arrays.IntervalArray.from_tuples(data) @@ -1158,7 +1154,7 @@ def mid(self): Length: 3, closed: right, dtype: interval[int64] """ ), - } + ) ) def overlaps(self, other): if isinstance(other, (IntervalArray, ABCIntervalIndex)): @@ -1209,9 +1205,9 @@ def closed(self): @Appender( _interval_shared_docs["set_closed"] - % { - "klass": "IntervalArray", - "examples": textwrap.dedent( + % dict( + klass="IntervalArray", + examples=textwrap.dedent( """\ Examples -------- @@ -1226,7 +1222,7 @@ def closed(self): Length: 3, closed: both, dtype: interval[int64] """ ), - } + ) ) def set_closed(self, closed): if closed not in VALID_CLOSED: @@ -1362,7 +1358,7 @@ def __arrow_array__(self, type=None): """ @Appender( - _interval_shared_docs["to_tuples"] % {"return_type": "ndarray", "examples": ""} + _interval_shared_docs["to_tuples"] % dict(return_type="ndarray", examples="") ) def to_tuples(self, na_tuple=True): tuples = com.asarray_tuplesafe(zip(self._left, self._right)) @@ -1375,7 +1371,7 @@ def to_tuples(self, na_tuple=True): @Appender(_extension_array_shared_docs["repeat"] % _shared_docs_kwargs) def repeat(self, repeats, axis=None): - nv.validate_repeat((), {"axis": axis}) + nv.validate_repeat(tuple(), dict(axis=axis)) left_repeat = self.left.repeat(repeats) right_repeat = self.right.repeat(repeats) return self._shallow_copy(left=left_repeat, right=right_repeat) @@ -1414,9 +1410,9 @@ def repeat(self, repeats, axis=None): @Appender( _interval_shared_docs["contains"] - % { - "klass": "IntervalArray", - "examples": textwrap.dedent( + % dict( + klass="IntervalArray", + examples=textwrap.dedent( """\ >>> intervals = pd.arrays.IntervalArray.from_tuples([(0, 1), (1, 3), (2, 4)]) >>> intervals @@ -1425,7 +1421,7 @@ def repeat(self, repeats, axis=None): Length: 3, closed: right, dtype: interval[int64] """ ), - } + ) ) def contains(self, other): if isinstance(other, Interval): diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py deleted file mode 100644 index 5447a84c86ac1..0000000000000 --- a/pandas/core/arrays/numeric.py +++ /dev/null @@ -1,92 +0,0 @@ -import datetime - -import numpy as np - -from pandas._libs import Timedelta, missing as libmissing -from pandas.errors import AbstractMethodError - -from pandas.core.dtypes.common import ( - is_float, - is_float_dtype, - is_integer, - is_integer_dtype, - is_list_like, -) - -from .masked import BaseMaskedArray - - -class NumericArray(BaseMaskedArray): - """ - Base class for IntegerArray and FloatingArray. 
- """ - - def _maybe_mask_result(self, result, mask, other, op_name: str): - raise AbstractMethodError(self) - - def _arith_method(self, other, op): - op_name = op.__name__ - omask = None - - if getattr(other, "ndim", 0) > 1: - raise NotImplementedError("can only perform ops with 1-d structures") - - if isinstance(other, NumericArray): - other, omask = other._data, other._mask - - elif is_list_like(other): - other = np.asarray(other) - if other.ndim > 1: - raise NotImplementedError("can only perform ops with 1-d structures") - if len(self) != len(other): - raise ValueError("Lengths must match") - if not (is_float_dtype(other) or is_integer_dtype(other)): - raise TypeError("can only perform ops with numeric values") - - elif isinstance(other, (datetime.timedelta, np.timedelta64)): - other = Timedelta(other) - - else: - if not (is_float(other) or is_integer(other) or other is libmissing.NA): - raise TypeError("can only perform ops with numeric values") - - if omask is None: - mask = self._mask.copy() - if other is libmissing.NA: - mask |= True - else: - mask = self._mask | omask - - if op_name == "pow": - # 1 ** x is 1. - mask = np.where((self._data == 1) & ~self._mask, False, mask) - # x ** 0 is 1. - if omask is not None: - mask = np.where((other == 0) & ~omask, False, mask) - elif other is not libmissing.NA: - mask = np.where(other == 0, False, mask) - - elif op_name == "rpow": - # 1 ** x is 1. - if omask is not None: - mask = np.where((other == 1) & ~omask, False, mask) - elif other is not libmissing.NA: - mask = np.where(other == 1, False, mask) - # x ** 0 is 1. - mask = np.where((self._data == 0) & ~self._mask, False, mask) - - if other is libmissing.NA: - result = np.ones_like(self._data) - else: - with np.errstate(all="ignore"): - result = op(self._data, other) - - # divmod returns a tuple - if op_name == "divmod": - div, mod = result - return ( - self._maybe_mask_result(div, mask, other, "floordiv"), - self._maybe_mask_result(mod, mask, other, "mod"), - ) - - return self._maybe_mask_result(result, mask, other, op_name) diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 50d12703c3a30..4eb67dcd12728 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -273,12 +273,12 @@ def _values_for_factorize(self) -> Tuple[np.ndarray, int]: # Reductions def any(self, *, axis=None, out=None, keepdims=False, skipna=True): - nv.validate_any((), {"out": out, "keepdims": keepdims}) + nv.validate_any((), dict(out=out, keepdims=keepdims)) result = nanops.nanany(self._ndarray, axis=axis, skipna=skipna) return self._wrap_reduction_result(axis, result) def all(self, *, axis=None, out=None, keepdims=False, skipna=True): - nv.validate_all((), {"out": out, "keepdims": keepdims}) + nv.validate_all((), dict(out=out, keepdims=keepdims)) result = nanops.nanall(self._ndarray, axis=axis, skipna=skipna) return self._wrap_reduction_result(axis, result) @@ -311,7 +311,7 @@ def prod(self, *, axis=None, skipna=True, min_count=0, **kwargs) -> Scalar: return self._wrap_reduction_result(axis, result) def mean(self, *, axis=None, dtype=None, out=None, keepdims=False, skipna=True): - nv.validate_mean((), {"dtype": dtype, "out": out, "keepdims": keepdims}) + nv.validate_mean((), dict(dtype=dtype, out=out, keepdims=keepdims)) result = nanops.nanmean(self._ndarray, axis=axis, skipna=skipna) return self._wrap_reduction_result(axis, result) @@ -319,7 +319,7 @@ def median( self, *, axis=None, out=None, overwrite_input=False, keepdims=False, skipna=True ): nv.validate_median( - 
(), {"out": out, "overwrite_input": overwrite_input, "keepdims": keepdims} + (), dict(out=out, overwrite_input=overwrite_input, keepdims=keepdims) ) result = nanops.nanmedian(self._ndarray, axis=axis, skipna=skipna) return self._wrap_reduction_result(axis, result) @@ -328,7 +328,7 @@ def std( self, *, axis=None, dtype=None, out=None, ddof=1, keepdims=False, skipna=True ): nv.validate_stat_ddof_func( - (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="std" + (), dict(dtype=dtype, out=out, keepdims=keepdims), fname="std" ) result = nanops.nanstd(self._ndarray, axis=axis, skipna=skipna, ddof=ddof) return self._wrap_reduction_result(axis, result) @@ -337,7 +337,7 @@ def var( self, *, axis=None, dtype=None, out=None, ddof=1, keepdims=False, skipna=True ): nv.validate_stat_ddof_func( - (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="var" + (), dict(dtype=dtype, out=out, keepdims=keepdims), fname="var" ) result = nanops.nanvar(self._ndarray, axis=axis, skipna=skipna, ddof=ddof) return self._wrap_reduction_result(axis, result) @@ -346,21 +346,21 @@ def sem( self, *, axis=None, dtype=None, out=None, ddof=1, keepdims=False, skipna=True ): nv.validate_stat_ddof_func( - (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="sem" + (), dict(dtype=dtype, out=out, keepdims=keepdims), fname="sem" ) result = nanops.nansem(self._ndarray, axis=axis, skipna=skipna, ddof=ddof) return self._wrap_reduction_result(axis, result) def kurt(self, *, axis=None, dtype=None, out=None, keepdims=False, skipna=True): nv.validate_stat_ddof_func( - (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="kurt" + (), dict(dtype=dtype, out=out, keepdims=keepdims), fname="kurt" ) result = nanops.nankurt(self._ndarray, axis=axis, skipna=skipna) return self._wrap_reduction_result(axis, result) def skew(self, *, axis=None, dtype=None, out=None, keepdims=False, skipna=True): nv.validate_stat_ddof_func( - (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="skew" + (), dict(dtype=dtype, out=out, keepdims=keepdims), fname="skew" ) result = nanops.nanskew(self._ndarray, axis=axis, skipna=skipna) return self._wrap_reduction_result(axis, result) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index b8375af797b3a..c591f81390388 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -58,7 +58,7 @@ SparseArrayT = TypeVar("SparseArrayT", bound="SparseArray") -_sparray_doc_kwargs = {"klass": "SparseArray"} +_sparray_doc_kwargs = dict(klass="SparseArray") def _get_fill(arr: "SparseArray") -> np.ndarray: diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index cc2013deb5252..e75305e55348c 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -18,8 +18,7 @@ from pandas.core import ops from pandas.core.array_algos import masked_reductions -from pandas.core.arrays import FloatingArray, IntegerArray, PandasArray -from pandas.core.arrays.floating import FloatingDtype +from pandas.core.arrays import IntegerArray, PandasArray from pandas.core.arrays.integer import _IntegerDtype from pandas.core.construction import extract_array from pandas.core.indexers import check_array_indexer @@ -295,19 +294,6 @@ def astype(self, dtype, copy=True): arr[mask] = 0 values = arr.astype(dtype.numpy_dtype) return IntegerArray(values, mask, copy=False) - elif isinstance(dtype, FloatingDtype): - arr = self.copy() - mask = self.isna() - arr[mask] = "0" - values = arr.astype(dtype.numpy_dtype) - 
return FloatingArray(values, mask, copy=False) - elif np.issubdtype(dtype, np.floating): - arr = self._ndarray.copy() - mask = self.isna() - arr[mask] = 0 - values = arr.astype(dtype) - values[mask] = np.nan - return values return super().astype(dtype, copy) diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 0921c3460c626..998117cc49d50 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -373,7 +373,7 @@ def sum( min_count: int = 0, ): nv.validate_sum( - (), {"dtype": dtype, "out": out, "keepdims": keepdims, "initial": initial} + (), dict(dtype=dtype, out=out, keepdims=keepdims, initial=initial) ) result = nanops.nansum( @@ -391,7 +391,7 @@ def std( skipna: bool = True, ): nv.validate_stat_ddof_func( - (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="std" + (), dict(dtype=dtype, out=out, keepdims=keepdims), fname="std" ) result = nanops.nanstd(self._ndarray, axis=axis, skipna=skipna, ddof=ddof) diff --git a/pandas/core/base.py b/pandas/core/base.py index f333ee0f71e46..5f724d9e89d05 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -46,13 +46,13 @@ if TYPE_CHECKING: from pandas import Categorical -_shared_docs: Dict[str, str] = {} -_indexops_doc_kwargs = { - "klass": "IndexOpsMixin", - "inplace": "", - "unique": "IndexOpsMixin", - "duplicated": "IndexOpsMixin", -} +_shared_docs: Dict[str, str] = dict() +_indexops_doc_kwargs = dict( + klass="IndexOpsMixin", + inplace="", + unique="IndexOpsMixin", + duplicated="IndexOpsMixin", +) _T = TypeVar("_T", bound="IndexOpsMixin") diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index b819886687817..0498d4d171c00 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -35,7 +35,7 @@ def __init__( queryables: Optional[Dict[str, Any]] = None, ): super().__init__(level + 1, global_dict=global_dict, local_dict=local_dict) - self.queryables = queryables or {} + self.queryables = queryables or dict() class Term(ops.Term): diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 96cf1be7520fb..f9ebe3f1e185e 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -402,24 +402,6 @@ def extract_array(obj: object, extract_numpy: bool = False) -> Union[Any, ArrayL return obj -def ensure_wrapped_if_datetimelike(arr): - """ - Wrap datetime64 and timedelta64 ndarrays in DatetimeArray/TimedeltaArray. 
- """ - if isinstance(arr, np.ndarray): - if arr.dtype.kind == "M": - from pandas.core.arrays import DatetimeArray - - return DatetimeArray._from_sequence(arr) - - elif arr.dtype.kind == "m": - from pandas.core.arrays import TimedeltaArray - - return TimedeltaArray._from_sequence(arr) - - return arr - - def sanitize_array( data, index: Optional[Index], diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 12974d56dacdc..0f0e82f4ad4e2 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -99,6 +99,7 @@ from pandas import Series from pandas.core.arrays import ExtensionArray from pandas.core.indexes.base import Index + from pandas.core.indexes.datetimes import DatetimeIndex _int8_max = np.iinfo(np.int8).max _int16_max = np.iinfo(np.int16).max @@ -296,9 +297,7 @@ def trans(x): return result -def maybe_cast_result( - result: ArrayLike, obj: "Series", numeric_only: bool = False, how: str = "" -) -> ArrayLike: +def maybe_cast_result(result, obj: "Series", numeric_only: bool = False, how: str = ""): """ Try casting result to a different type if appropriate @@ -318,23 +317,25 @@ def maybe_cast_result( result : array-like result maybe casted to the dtype. """ - dtype = obj.dtype + if obj.ndim > 1: + dtype = obj._values.dtype + else: + dtype = obj.dtype dtype = maybe_cast_result_dtype(dtype, how) - assert not is_scalar(result) - - if ( - is_extension_array_dtype(dtype) - and not is_categorical_dtype(dtype) - and dtype.kind != "M" - ): - # We have to special case categorical so as not to upcast - # things like counts back to categorical - cls = dtype.construct_array_type() - result = maybe_cast_to_extension_array(cls, result, dtype=dtype) + if not is_scalar(result): + if ( + is_extension_array_dtype(dtype) + and not is_categorical_dtype(dtype) + and dtype.kind != "M" + ): + # We have to special case categorical so as not to upcast + # things like counts back to categorical + cls = dtype.construct_array_type() + result = maybe_cast_to_extension_array(cls, result, dtype=dtype) - elif numeric_only and is_numeric_dtype(dtype) or not numeric_only: - result = maybe_downcast_to_dtype(result, dtype) + elif numeric_only and is_numeric_dtype(dtype) or not numeric_only: + result = maybe_downcast_to_dtype(result, dtype) return result @@ -451,9 +452,12 @@ def maybe_upcast_putmask( # NaN -> NaT # integer or integer array -> date-like array if result.dtype.kind in ["m", "M"]: - if isna(other): - other = result.dtype.type("nat") - elif is_integer(other): + if is_scalar(other): + if isna(other): + other = result.dtype.type("nat") + elif is_integer(other): + other = np.array(other, dtype=result.dtype) + elif is_integer_dtype(other): other = np.array(other, dtype=result.dtype) def changeit(): @@ -506,8 +510,9 @@ def maybe_casted_values( """ values = index._values - if values.dtype == np.object_: - values = lib.maybe_convert_objects(values) + if not isinstance(index, (ABCPeriodIndex, ABCDatetimeIndex)): + if values.dtype == np.object_: + values = lib.maybe_convert_objects(values) # if we have the codes, extract the values with a mask if codes is not None: @@ -1123,37 +1128,101 @@ def astype_nansafe( return arr.view(dtype) -def soft_convert_objects( - values: np.ndarray, - datetime: bool = True, - numeric: bool = True, - timedelta: bool = True, - copy: bool = True, -): +def maybe_convert_objects( + values: np.ndarray, convert_numeric: bool = True +) -> Union[np.ndarray, "DatetimeIndex"]: """ - Try to coerce datetime, timedelta, and numeric object-dtype columns - to inferred 
dtype. + If we have an object dtype array, try to coerce dates and/or numbers. Parameters ---------- - values : np.ndarray[object] - datetime : bool, default True - numeric: bool, default True - timedelta : bool, default True - copy : bool, default True + values : ndarray + convert_numeric : bool, default True Returns ------- - np.ndarray + ndarray or DatetimeIndex """ + validate_bool_kwarg(convert_numeric, "convert_numeric") + + orig_values = values + + # convert dates + if is_object_dtype(values.dtype): + values = lib.maybe_convert_objects(values, convert_datetime=True) + + # convert timedeltas + if is_object_dtype(values.dtype): + values = lib.maybe_convert_objects(values, convert_timedelta=True) + + # convert to numeric + if is_object_dtype(values.dtype): + if convert_numeric: + try: + new_values = lib.maybe_convert_numeric( + values, set(), coerce_numeric=True + ) + except (ValueError, TypeError): + pass + else: + # if we are all nans then leave me alone + if not isna(new_values).all(): + values = new_values + + else: + # soft-conversion + values = lib.maybe_convert_objects(values) + + if values is orig_values: + values = values.copy() + + return values + + +def soft_convert_objects( + values: np.ndarray, + datetime: bool = True, + numeric: bool = True, + timedelta: bool = True, + coerce: bool = False, + copy: bool = True, +): + """ if we have an object dtype, try to coerce dates and/or numbers """ validate_bool_kwarg(datetime, "datetime") validate_bool_kwarg(numeric, "numeric") validate_bool_kwarg(timedelta, "timedelta") + validate_bool_kwarg(coerce, "coerce") validate_bool_kwarg(copy, "copy") conversion_count = sum((datetime, numeric, timedelta)) if conversion_count == 0: raise ValueError("At least one of datetime, numeric or timedelta must be True.") + elif conversion_count > 1 and coerce: + raise ValueError( + "Only one of 'datetime', 'numeric' or " + "'timedelta' can be True when coerce=True." + ) + + if not is_object_dtype(values.dtype): + # If not object, do not attempt conversion + values = values.copy() if copy else values + return values + + # If 1 flag is coerce, ensure 2 others are False + if coerce: + # Immediate return if coerce + if datetime: + from pandas import to_datetime + + return to_datetime(values, errors="coerce").to_numpy() + elif timedelta: + from pandas import to_timedelta + + return to_timedelta(values, errors="coerce").to_numpy() + elif numeric: + from pandas import to_numeric + + return to_numeric(values, errors="coerce") # Soft conversions if datetime: @@ -1186,7 +1255,6 @@ def convert_dtypes( convert_string: bool = True, convert_integer: bool = True, convert_boolean: bool = True, - convert_floating: bool = True, ) -> Dtype: """ Convert objects to best possible type, and optionally, @@ -1201,10 +1269,6 @@ def convert_dtypes( Whether, if possible, conversion can be done to integer extension types. convert_boolean : bool, defaults True Whether object dtypes should be converted to ``BooleanDtypes()``. - convert_floating : bool, defaults True - Whether, if possible, conversion can be done to floating extension types. - If `convert_integer` is also True, preference will be give to integer - dtypes if the floats can be faithfully casted to integers. 
Returns ------- @@ -1212,9 +1276,7 @@ def convert_dtypes( new dtype """ is_extension = is_extension_array_dtype(input_array.dtype) - if ( - convert_string or convert_integer or convert_boolean or convert_floating - ) and not is_extension: + if (convert_string or convert_integer or convert_boolean) and not is_extension: try: inferred_dtype = lib.infer_dtype(input_array) except ValueError: @@ -1242,29 +1304,6 @@ def convert_dtypes( if is_integer_dtype(inferred_dtype): inferred_dtype = input_array.dtype - if convert_floating: - if not is_integer_dtype(input_array.dtype) and is_numeric_dtype( - input_array.dtype - ): - from pandas.core.arrays.floating import FLOAT_STR_TO_DTYPE - - inferred_float_dtype = FLOAT_STR_TO_DTYPE.get( - input_array.dtype.name, "Float64" - ) - # if we could also convert to integer, check if all floats - # are actually integers - if convert_integer: - arr = input_array[notna(input_array)] - if (arr.astype(int) == arr).all(): - inferred_dtype = "Int64" - else: - inferred_dtype = inferred_float_dtype - else: - inferred_dtype = inferred_float_dtype - else: - if is_float_dtype(inferred_dtype): - inferred_dtype = input_array.dtype - if convert_boolean: if is_bool_dtype(input_array.dtype): inferred_dtype = "boolean" @@ -1318,6 +1357,9 @@ def maybe_infer_to_datetimelike( value, (ABCDatetimeIndex, ABCPeriodIndex, ABCDatetimeArray, ABCPeriodArray) ): return value + elif isinstance(value, ABCSeries): + if isinstance(value._values, ABCDatetimeIndex): + return value._values v = value @@ -1409,6 +1451,9 @@ def maybe_cast_to_datetime(value, dtype: DtypeObj, errors: str = "raise"): from pandas.core.tools.timedeltas import to_timedelta if dtype is not None: + if isinstance(dtype, str): + dtype = np.dtype(dtype) + is_datetime64 = is_datetime64_dtype(dtype) is_datetime64tz = is_datetime64tz_dtype(dtype) is_timedelta64 = is_timedelta64_dtype(dtype) @@ -1421,21 +1466,18 @@ def maybe_cast_to_datetime(value, dtype: DtypeObj, errors: str = "raise"): f"Please pass in '{dtype.name}[ns]' instead." ) - if is_datetime64: - # unpack e.g. 
SparseDtype - dtype = getattr(dtype, "subtype", dtype) - if not is_dtype_equal(dtype, DT64NS_DTYPE): - - # pandas supports dtype whose granularity is less than [ns] - # e.g., [ps], [fs], [as] - if dtype <= np.dtype("M8[ns]"): - if dtype.name == "datetime64": - raise ValueError(msg) - dtype = DT64NS_DTYPE - else: - raise TypeError( - f"cannot convert datetimelike to dtype [{dtype}]" - ) + if is_datetime64 and not is_dtype_equal( + getattr(dtype, "subtype", dtype), DT64NS_DTYPE + ): + + # pandas supports dtype whose granularity is less than [ns] + # e.g., [ps], [fs], [as] + if dtype <= np.dtype("M8[ns]"): + if dtype.name == "datetime64": + raise ValueError(msg) + dtype = DT64NS_DTYPE + else: + raise TypeError(f"cannot convert datetimelike to dtype [{dtype}]") elif is_datetime64tz: # our NaT doesn't support tz's diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index a9355e30cd3c2..a9b0498081511 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -18,7 +18,7 @@ from pandas.core.arrays import ExtensionArray from pandas.core.arrays.sparse import SparseArray -from pandas.core.construction import array, ensure_wrapped_if_datetimelike +from pandas.core.construction import array def _get_dtype_kinds(arrays) -> Set[str]: @@ -152,7 +152,7 @@ def is_nonempty(x) -> bool: return np.concatenate(to_concat) elif _contains_datetime or "timedelta" in typs: - return _concat_datetime(to_concat, axis=axis) + return _concat_datetime(to_concat, axis=axis, typs=typs) elif all_empty: # we have all empties, but may need to coerce the result dtype to @@ -346,7 +346,7 @@ def _concatenate_2d(to_concat, axis: int): return np.concatenate(to_concat, axis=axis) -def _concat_datetime(to_concat, axis=0): +def _concat_datetime(to_concat, axis=0, typs=None): """ provide concatenation of an datetimelike array of arrays each of which is a single M8[ns], datetime64[ns, tz] or m8[ns] dtype @@ -355,19 +355,21 @@ def _concat_datetime(to_concat, axis=0): ---------- to_concat : array of arrays axis : axis to provide concatenation + typs : set of to_concat dtypes Returns ------- a single array, preserving the combined dtypes """ - to_concat = [ensure_wrapped_if_datetimelike(x) for x in to_concat] + if typs is None: + typs = _get_dtype_kinds(to_concat) + to_concat = [_wrap_datetimelike(x) for x in to_concat] single_dtype = len({x.dtype for x in to_concat}) == 1 # multiple types, need to coerce to object if not single_dtype: - # ensure_wrapped_if_datetimelike ensures that astype(object) wraps - # in Timestamp/Timedelta + # wrap_datetimelike ensures that astype(object) wraps in Timestamp/Timedelta return _concatenate_2d([x.astype(object) for x in to_concat], axis=axis) if axis == 1: @@ -381,3 +383,17 @@ def _concat_datetime(to_concat, axis=0): assert result.shape[0] == 1 result = result[0] return result + + +def _wrap_datetimelike(arr): + """ + Wrap datetime64 and timedelta64 ndarrays in DatetimeArray/TimedeltaArray. + + DTA/TDA handle .astype(object) correctly. 
+ """ + from pandas.core.construction import array as pd_array, extract_array + + arr = extract_array(arr, extract_numpy=True) + if isinstance(arr, np.ndarray) and arr.dtype.kind in ["m", "M"]: + arr = pd_array(arr) + return arr diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 136c8032094b1..07280702cf06f 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -399,14 +399,10 @@ def __eq__(self, other: Any) -> bool: def __repr__(self) -> str_type: if self.categories is None: - data = "None" + data = "None, " else: data = self.categories._format_data(name=type(self).__name__) - if data is None: - # self.categories is RangeIndex - data = str(self.categories._range) - data = data.rstrip(", ") - return f"CategoricalDtype(categories={data}, ordered={self.ordered})" + return f"CategoricalDtype(categories={data}ordered={self.ordered})" @staticmethod def _hash_categories(categories, ordered: Ordered = True) -> int: diff --git a/pandas/core/dtypes/generic.py b/pandas/core/dtypes/generic.py index dfbbaa9c1784a..0e5867809fe52 100644 --- a/pandas/core/dtypes/generic.py +++ b/pandas/core/dtypes/generic.py @@ -4,20 +4,7 @@ from typing import TYPE_CHECKING, Type, cast if TYPE_CHECKING: - from pandas import ( - CategoricalIndex, - DataFrame, - DatetimeIndex, - Float64Index, - Int64Index, - IntervalIndex, - MultiIndex, - PeriodIndex, - RangeIndex, - Series, - TimedeltaIndex, - UInt64Index, - ) + from pandas import DataFrame, Series from pandas.core.generic import NDFrame @@ -31,50 +18,28 @@ def create_pandas_abc_type(name, attr, comp): def _check(cls, inst) -> bool: return getattr(inst, attr, "_typ") in comp - dct = {"__instancecheck__": _check, "__subclasscheck__": _check} + dct = dict(__instancecheck__=_check, __subclasscheck__=_check) meta = type("ABCBase", (type,), dct) - return meta(name, (), dct) + return meta(name, tuple(), dct) -ABCInt64Index = cast( - "Type[Int64Index]", - create_pandas_abc_type("ABCInt64Index", "_typ", ("int64index",)), -) -ABCUInt64Index = cast( - "Type[UInt64Index]", - create_pandas_abc_type("ABCUInt64Index", "_typ", ("uint64index",)), -) -ABCRangeIndex = cast( - "Type[RangeIndex]", - create_pandas_abc_type("ABCRangeIndex", "_typ", ("rangeindex",)), -) -ABCFloat64Index = cast( - "Type[Float64Index]", - create_pandas_abc_type("ABCFloat64Index", "_typ", ("float64index",)), -) -ABCMultiIndex = cast( - "Type[MultiIndex]", - create_pandas_abc_type("ABCMultiIndex", "_typ", ("multiindex",)), -) -ABCDatetimeIndex = cast( - "Type[DatetimeIndex]", - create_pandas_abc_type("ABCDatetimeIndex", "_typ", ("datetimeindex",)), -) -ABCTimedeltaIndex = cast( - "Type[TimedeltaIndex]", - create_pandas_abc_type("ABCTimedeltaIndex", "_typ", ("timedeltaindex",)), -) -ABCPeriodIndex = cast( - "Type[PeriodIndex]", - create_pandas_abc_type("ABCPeriodIndex", "_typ", ("periodindex",)), -) -ABCCategoricalIndex = cast( - "Type[CategoricalIndex]", - create_pandas_abc_type("ABCCategoricalIndex", "_typ", ("categoricalindex",)), -) -ABCIntervalIndex = cast( - "Type[IntervalIndex]", - create_pandas_abc_type("ABCIntervalIndex", "_typ", ("intervalindex",)), +ABCInt64Index = create_pandas_abc_type("ABCInt64Index", "_typ", ("int64index",)) +ABCUInt64Index = create_pandas_abc_type("ABCUInt64Index", "_typ", ("uint64index",)) +ABCRangeIndex = create_pandas_abc_type("ABCRangeIndex", "_typ", ("rangeindex",)) +ABCFloat64Index = create_pandas_abc_type("ABCFloat64Index", "_typ", ("float64index",)) +ABCMultiIndex = create_pandas_abc_type("ABCMultiIndex", "_typ", 
("multiindex",)) +ABCDatetimeIndex = create_pandas_abc_type( + "ABCDatetimeIndex", "_typ", ("datetimeindex",) +) +ABCTimedeltaIndex = create_pandas_abc_type( + "ABCTimedeltaIndex", "_typ", ("timedeltaindex",) +) +ABCPeriodIndex = create_pandas_abc_type("ABCPeriodIndex", "_typ", ("periodindex",)) +ABCCategoricalIndex = create_pandas_abc_type( + "ABCCategoricalIndex", "_typ", ("categoricalindex",) +) +ABCIntervalIndex = create_pandas_abc_type( + "ABCIntervalIndex", "_typ", ("intervalindex",) ) ABCIndexClass = create_pandas_abc_type( "ABCIndexClass", diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f710660d6ad8e..c9030a0b2423a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -172,14 +172,14 @@ # --------------------------------------------------------------------- # Docstring templates -_shared_doc_kwargs = { - "axes": "index, columns", - "klass": "DataFrame", - "axes_single_arg": "{0 or 'index', 1 or 'columns'}", - "axis": """axis : {0 or 'index', 1 or 'columns'}, default 0 +_shared_doc_kwargs = dict( + axes="index, columns", + klass="DataFrame", + axes_single_arg="{0 or 'index', 1 or 'columns'}", + axis="""axis : {0 or 'index', 1 or 'columns'}, default 0 If 0 or 'index': apply function to each column. If 1 or 'columns': apply function to each row.""", - "optional_by": """ + optional_by=""" by : str or list of str Name or list of names to sort by. @@ -187,12 +187,12 @@ levels and/or column labels. - if `axis` is 1 or `'columns'` then `by` may contain column levels and/or index labels.""", - "optional_labels": """labels : array-like, optional + optional_labels="""labels : array-like, optional New labels / index to conform the axis specified by 'axis' to.""", - "optional_axis": """axis : int or str, optional + optional_axis="""axis : int or str, optional Axis to target. Can be either the axis name ('index', 'columns') or number (0, 1).""", -} +) _numeric_only_doc = """numeric_only : boolean, default None Include only float, int, boolean data. 
If None, will attempt to use @@ -524,7 +524,7 @@ def __init__( return mgr = self._init_mgr( - data, axes={"index": index, "columns": columns}, dtype=dtype, copy=copy + data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy ) elif isinstance(data, dict): @@ -2902,7 +2902,7 @@ def transpose(self, *args, copy: bool = False) -> DataFrame: 1 object dtype: object """ - nv.validate_transpose(args, {}) + nv.validate_transpose(args, dict()) # construct the args dtypes = list(self.dtypes) @@ -5273,7 +5273,6 @@ def drop_duplicates( return self.copy() inplace = validate_bool_kwarg(inplace, "inplace") - ignore_index = validate_bool_kwarg(ignore_index, "ignore_index") duplicated = self.duplicated(subset, keep=keep) result = self[-duplicated] @@ -6387,7 +6386,7 @@ def combine( otherSeries = otherSeries.astype(new_dtype) arr = func(series, otherSeries) - arr = maybe_downcast_to_dtype(arr, new_dtype) + arr = maybe_downcast_to_dtype(arr, this_dtype) result[col] = arr @@ -7359,7 +7358,7 @@ def unstack(self, level=-1, fill_value=None): return result.__finalize__(self, method="unstack") - @Appender(_shared_docs["melt"] % {"caller": "df.melt(", "other": "melt"}) + @Appender(_shared_docs["melt"] % dict(caller="df.melt(", other="melt")) def melt( self, id_vars=None, @@ -9002,11 +9001,7 @@ def idxmin(self, axis=0, skipna=True) -> Series: dtype: object """ axis = self._get_axis_number(axis) - - res = self._reduce( - nanops.nanargmin, "argmin", axis=axis, skipna=skipna, numeric_only=False - ) - indices = res._values + indices = nanops.nanargmin(self.values, axis=axis, skipna=skipna) # indices will always be np.ndarray since axis is not None and # values is a 2d array for DataFrame @@ -9079,11 +9074,7 @@ def idxmax(self, axis=0, skipna=True) -> Series: dtype: object """ axis = self._get_axis_number(axis) - - res = self._reduce( - nanops.nanargmax, "argmax", axis=axis, skipna=skipna, numeric_only=False - ) - indices = res._values + indices = nanops.nanargmax(self.values, axis=axis, skipna=skipna) # indices will always be np.ndarray since axis is not None and # values is a 2d array for DataFrame diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 808981debf1fe..e12053b71a815 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -128,15 +128,15 @@ # goal is to be able to define the docs close to function, while still being # able to share _shared_docs = {**_shared_docs} -_shared_doc_kwargs = { - "axes": "keywords for axes", - "klass": "Series/DataFrame", - "axes_single_arg": "int or labels for object", - "args_transpose": "axes to permute (int or label for object)", - "optional_by": """ +_shared_doc_kwargs = dict( + axes="keywords for axes", + klass="Series/DataFrame", + axes_single_arg="int or labels for object", + args_transpose="axes to permute (int or label for object)", + optional_by=""" by : str or list of str Name or list of names to sort by""", -} +) bool_t = bool # Need alias because NDFrame has def bool: @@ -484,7 +484,7 @@ def _get_block_manager_axis(cls, axis: Axis) -> int: def _get_axis_resolvers(self, axis: str) -> Dict[str, Union[Series, MultiIndex]]: # index or columns axis_index = getattr(self, axis) - d = {} + d = dict() prefix = axis[0] for i, name in enumerate(axis_index.names): @@ -1946,14 +1946,14 @@ def __array_ufunc__( @final def __getstate__(self) -> Dict[str, Any]: meta = {k: getattr(self, k, None) for k in self._metadata} - return { - "_mgr": self._mgr, - "_typ": self._typ, - "_metadata": self._metadata, - "attrs": self.attrs, - "_flags": {k: 
self.flags[k] for k in self.flags._keys}, + return dict( + _mgr=self._mgr, + _typ=self._typ, + _metadata=self._metadata, + attrs=self.attrs, + _flags={k: self.flags[k] for k in self.flags._keys}, **meta, - } + ) @final def __setstate__(self, state): @@ -1967,7 +1967,7 @@ def __setstate__(self, state): if typ is not None: attrs = state.get("_attrs", {}) object.__setattr__(self, "_attrs", attrs) - flags = state.get("_flags", {"allows_duplicate_labels": True}) + flags = state.get("_flags", dict(allows_duplicate_labels=True)) object.__setattr__(self, "_flags", Flags(self, **flags)) # set in the order of internal names @@ -2799,13 +2799,6 @@ def to_pickle( default 'infer' A string representing the compression to use in the output file. By default, infers from the file extension in specified path. - Compression mode may be any of the following possible - values: {{‘infer’, ‘gzip’, ‘bz2’, ‘zip’, ‘xz’, None}}. If compression - mode is ‘infer’ and path_or_buf is path-like, then detect - compression mode from the following extensions: - ‘.gz’, ‘.bz2’, ‘.zip’ or ‘.xz’. (otherwise no compression). - If dict given and mode is ‘zip’ or inferred as ‘zip’, other entries - passed as additional compression options. protocol : int Int which indicates which protocol should be used by the pickler, default HIGHEST_PROTOCOL (see [1]_ paragraph 12.1.2). The possible @@ -6004,6 +5997,7 @@ def _convert( datetime: bool_t = False, numeric: bool_t = False, timedelta: bool_t = False, + coerce: bool_t = False, ) -> FrameOrSeries: """ Attempt to infer better dtype for object columns @@ -6017,6 +6011,9 @@ def _convert( unconvertible values becoming NaN. timedelta : bool, default False If True, convert to timedelta where possible. + coerce : bool, default False + If True, force conversion with unconvertible values converted to + nulls (NaN or NaT). Returns ------- @@ -6025,11 +6022,13 @@ def _convert( validate_bool_kwarg(datetime, "datetime") validate_bool_kwarg(numeric, "numeric") validate_bool_kwarg(timedelta, "timedelta") + validate_bool_kwarg(coerce, "coerce") return self._constructor( self._mgr.convert( datetime=datetime, numeric=numeric, timedelta=timedelta, + coerce=coerce, copy=True, ) ).__finalize__(self) @@ -6077,7 +6076,9 @@ def infer_objects(self: FrameOrSeries) -> FrameOrSeries: # python objects will still be converted to # native numpy numeric types return self._constructor( - self._mgr.convert(datetime=True, numeric=False, timedelta=True, copy=True) + self._mgr.convert( + datetime=True, numeric=False, timedelta=True, coerce=False, copy=True + ) ).__finalize__(self, method="infer_objects") @final @@ -6087,7 +6088,6 @@ def convert_dtypes( convert_string: bool_t = True, convert_integer: bool_t = True, convert_boolean: bool_t = True, - convert_floating: bool_t = True, ) -> FrameOrSeries: """ Convert columns to best possible dtypes using dtypes supporting ``pd.NA``. @@ -6104,12 +6104,6 @@ def convert_dtypes( Whether, if possible, conversion can be done to integer extension types. convert_boolean : bool, defaults True Whether object dtypes should be converted to ``BooleanDtypes()``. - convert_floating : bool, defaults True - Whether, if possible, conversion can be done to floating extension types. - If `convert_integer` is also True, preference will be give to integer - dtypes if the floats can be faithfully casted to integers. - - .. 
versionadded:: 1.2.0 Returns ------- @@ -6127,25 +6121,19 @@ def convert_dtypes( ----- By default, ``convert_dtypes`` will attempt to convert a Series (or each Series in a DataFrame) to dtypes that support ``pd.NA``. By using the options - ``convert_string``, ``convert_integer``, ``convert_boolean`` and - ``convert_boolean``, it is possible to turn off individual conversions - to ``StringDtype``, the integer extension types, ``BooleanDtype`` - or floating extension types, respectively. + ``convert_string``, ``convert_integer``, and ``convert_boolean``, it is + possible to turn off individual conversions to ``StringDtype``, the integer + extension types or ``BooleanDtype``, respectively. For object-dtyped columns, if ``infer_objects`` is ``True``, use the inference rules as during normal Series/DataFrame construction. Then, if possible, - convert to ``StringDtype``, ``BooleanDtype`` or an appropriate integer - or floating extension type, otherwise leave as ``object``. + convert to ``StringDtype``, ``BooleanDtype`` or an appropriate integer extension + type, otherwise leave as ``object``. If the dtype is integer, convert to an appropriate integer extension type. If the dtype is numeric, and consists of all integers, convert to an - appropriate integer extension type. Otherwise, convert to an - appropriate floating extension type. - - .. versionchanged:: 1.2 - Starting with pandas 1.2, this method also converts float columns - to the nullable floating extension type. + appropriate integer extension type. In the future, as new dtypes are added that support ``pd.NA``, the results of this method will change to support those new dtypes. @@ -6185,7 +6173,7 @@ def convert_dtypes( >>> dfn = df.convert_dtypes() >>> dfn a b c d e f - 0 1 x True h 10 + 0 1 x True h 10 NaN 1 2 y False i 100.5 2 3 z 20 200.0 @@ -6195,7 +6183,7 @@ def convert_dtypes( c boolean d string e Int64 - f Float64 + f float64 dtype: object Start with a Series of strings and missing data represented by ``np.nan``. @@ -6217,20 +6205,12 @@ def convert_dtypes( """ if self.ndim == 1: return self._convert_dtypes( - infer_objects, - convert_string, - convert_integer, - convert_boolean, - convert_floating, + infer_objects, convert_string, convert_integer, convert_boolean ) else: results = [ col._convert_dtypes( - infer_objects, - convert_string, - convert_integer, - convert_boolean, - convert_floating, + infer_objects, convert_string, convert_integer, convert_boolean ) for col_name, col in self.items() ] @@ -7429,7 +7409,7 @@ def isna(self: FrameOrSeries) -> FrameOrSeries: >>> df = pd.DataFrame(dict(age=[5, 6, np.NaN], ... born=[pd.NaT, pd.Timestamp('1939-05-27'), - ... pd.Timestamp('1940-04-25')], + ... pd.Timestamp('1940-04-25')], ... name=['Alfred', 'Batman', ''], ... toy=[None, 'Batmobile', 'Joker'])) >>> df @@ -7496,7 +7476,7 @@ def notna(self: FrameOrSeries) -> FrameOrSeries: >>> df = pd.DataFrame(dict(age=[5, 6, np.NaN], ... born=[pd.NaT, pd.Timestamp('1939-05-27'), - ... pd.Timestamp('1940-04-25')], + ... pd.Timestamp('1940-04-25')], ... name=['Alfred', 'Batman', ''], ... toy=[None, 'Batmobile', 'Joker'])) >>> df @@ -8234,8 +8214,8 @@ def resample( For DataFrame objects, the keyword `on` can be used to specify the column instead of the index for resampling. - >>> d = {'price': [10, 11, 9, 13, 14, 18, 17, 19], - ... 'volume': [50, 60, 40, 100, 50, 100, 40, 50]} + >>> d = dict({'price': [10, 11, 9, 13, 14, 18, 17, 19], + ... 
'volume': [50, 60, 40, 100, 50, 100, 40, 50]}) >>> df = pd.DataFrame(d) >>> df['week_starting'] = pd.date_range('01/01/2018', ... periods=8, @@ -8260,8 +8240,8 @@ def resample( specify on which level the resampling needs to take place. >>> days = pd.date_range('1/1/2000', periods=4, freq='D') - >>> d2 = {'price': [10, 11, 9, 13, 14, 18, 17, 19], - ... 'volume': [50, 60, 40, 100, 50, 100, 40, 50]} + >>> d2 = dict({'price': [10, 11, 9, 13, 14, 18, 17, 19], + ... 'volume': [50, 60, 40, 100, 50, 100, 40, 50]}) >>> df2 = pd.DataFrame(d2, ... index=pd.MultiIndex.from_product([days, ... ['morning', @@ -10531,10 +10511,10 @@ def pct_change( Percentage change in French franc, Deutsche Mark, and Italian lira from 1980-01-01 to 1980-03-01. - >>> df = pd.DataFrame({ - ... 'FR': [4.0405, 4.0963, 4.3149], - ... 'GR': [1.7246, 1.7482, 1.8519], - ... 'IT': [804.74, 810.01, 860.13]}, + >>> df = pd.DataFrame(dict( + ... FR=[4.0405, 4.0963, 4.3149], + ... GR=[1.7246, 1.7482, 1.8519], + ... IT=[804.74, 810.01, 860.13]), ... index=['1980-01-01', '1980-02-01', '1980-03-01']) >>> df FR GR IT @@ -10551,10 +10531,10 @@ def pct_change( Percentage of change in GOOG and APPL stock volume. Shows computing the percentage change between columns. - >>> df = pd.DataFrame({ - ... '2016': [1769950, 30586265], - ... '2015': [1500923, 40912316], - ... '2014': [1371819, 41403351]}, + >>> df = pd.DataFrame(dict([ + ... ('2016', [1769950, 30586265]), + ... ('2015', [1500923, 40912316]), + ... ('2014', [1371819, 41403351])]), ... index=['GOOG', 'APPL']) >>> df 2016 2015 2014 diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index 99426c55da29b..7dc0db35bf8fe 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -6,8 +6,6 @@ import collections from typing import List -from pandas._typing import final - from pandas.core.dtypes.common import is_list_like, is_scalar from pandas.core.base import PandasObject @@ -18,7 +16,6 @@ class ShallowMixin(PandasObject): _attributes: List[str] = [] - @final def _shallow_copy(self, obj, **kwargs): """ return a new object with the replacement attributes @@ -38,7 +35,6 @@ class GotItemMixin(PandasObject): _attributes: List[str] - @final def _gotitem(self, key, ndim, subset=None): """ Sub-classes to define. Return a sliced object. 
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 07ffb881495fa..244c47cd1f1ea 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -35,7 +35,9 @@ from pandas.core.dtypes.cast import ( find_common_type, + maybe_cast_result, maybe_cast_result_dtype, + maybe_convert_objects, maybe_downcast_numeric, ) from pandas.core.dtypes.common import ( @@ -45,12 +47,12 @@ is_integer_dtype, is_interval_dtype, is_numeric_dtype, + is_object_dtype, is_scalar, needs_i8_conversion, ) from pandas.core.dtypes.missing import isna, notna -from pandas.core import algorithms, nanops from pandas.core.aggregation import ( agg_list_like, aggregate, @@ -58,12 +60,13 @@ reconstruct_func, validate_func_kwargs, ) -from pandas.core.arrays import Categorical, ExtensionArray +import pandas.core.algorithms as algorithms +from pandas.core.arrays import ExtensionArray from pandas.core.base import DataError, SpecificationError import pandas.core.common as com from pandas.core.construction import create_series_with_explicit_dtype from pandas.core.frame import DataFrame -from pandas.core.generic import NDFrame +from pandas.core.generic import ABCDataFrame, ABCSeries, NDFrame from pandas.core.groupby import base from pandas.core.groupby.groupby import ( GroupBy, @@ -531,7 +534,7 @@ def _transform_general(self, func, *args, **kwargs): object.__setattr__(group, "name", name) res = func(group, *args, **kwargs) - if isinstance(res, (DataFrame, Series)): + if isinstance(res, (ABCDataFrame, ABCSeries)): res = res._values results.append(klass(res, index=group.index)) @@ -1026,69 +1029,43 @@ def _cython_agg_blocks( if numeric_only: data = data.get_numeric_data(copy=False) + no_result = object() + def cast_agg_result(result, values: ArrayLike, how: str) -> ArrayLike: # see if we can cast the values to the desired dtype # this may not be the original dtype assert not isinstance(result, DataFrame) + assert result is not no_result dtype = maybe_cast_result_dtype(values.dtype, how) result = maybe_downcast_numeric(result, dtype) - if isinstance(values, Categorical) and isinstance(result, np.ndarray): - # If the Categorical op didn't raise, it is dtype-preserving - result = type(values)._from_sequence(result.ravel(), dtype=values.dtype) - # Note this will have result.dtype == dtype from above + if isinstance(values, ExtensionArray) and isinstance(result, np.ndarray): + # e.g. 
values was an IntegerArray + # (1, N) case can occur if values was Categorical + # and result is ndarray[object] + # TODO(EA2D): special casing not needed with 2D EAs + assert result.ndim == 1 or result.shape[0] == 1 + try: + # Cast back if feasible + result = type(values)._from_sequence( + result.ravel(), dtype=values.dtype + ) + except (ValueError, TypeError): + # reshape to be valid for non-Extension Block + result = result.reshape(1, -1) elif isinstance(result, np.ndarray) and result.ndim == 1: # We went through a SeriesGroupByPath and need to reshape - # GH#32223 includes case with IntegerArray values result = result.reshape(1, -1) - # test_groupby_duplicate_columns gets here with - # result.dtype == int64, values.dtype=object, how="min" - - return result - - def py_fallback(bvalues: ArrayLike) -> ArrayLike: - # if self.grouper.aggregate fails, we fall back to a pure-python - # solution - - # We get here with a) EADtypes and b) object dtype - obj: FrameOrSeriesUnion - - # call our grouper again with only this block - if isinstance(bvalues, ExtensionArray): - # TODO(EA2D): special case not needed with 2D EAs - obj = Series(bvalues) - else: - obj = DataFrame(bvalues.T) - if obj.shape[1] == 1: - # Avoid call to self.values that can occur in DataFrame - # reductions; see GH#28949 - obj = obj.iloc[:, 0] - - # Create SeriesGroupBy with observed=True so that it does - # not try to add missing categories if grouping over multiple - # Categoricals. This will done by later self._reindex_output() - # Doing it here creates an error. See GH#34951 - sgb = get_groupby(obj, self.grouper, observed=True) - result = sgb.aggregate(lambda x: alt(x, axis=self.axis)) - - assert isinstance(result, (Series, DataFrame)) # for mypy - # In the case of object dtype block, it may have been split - # in the operation. We un-split here. - result = result._consolidate() - assert isinstance(result, (Series, DataFrame)) # for mypy - assert len(result._mgr.blocks) == 1 - # unwrap DataFrame to get array - result = result._mgr.blocks[0].values return result def blk_func(bvalues: ArrayLike) -> ArrayLike: try: - result = self.grouper._cython_operation( - "aggregate", bvalues, how, axis=1, min_count=min_count + result, _ = self.grouper.aggregate( + bvalues, how, axis=1, min_count=min_count ) except NotImplementedError: # generally if we have numeric_only=False @@ -1101,7 +1078,35 @@ def blk_func(bvalues: ArrayLike) -> ArrayLike: assert how == "ohlc" raise - result = py_fallback(bvalues) + # We get here with a) EADtypes and b) object dtype + obj: FrameOrSeriesUnion + # call our grouper again with only this block + if isinstance(bvalues, ExtensionArray): + # TODO(EA2D): special case not needed with 2D EAs + obj = Series(bvalues) + else: + obj = DataFrame(bvalues.T) + if obj.shape[1] == 1: + # Avoid call to self.values that can occur in DataFrame + # reductions; see GH#28949 + obj = obj.iloc[:, 0] + + # Create SeriesGroupBy with observed=True so that it does + # not try to add missing categories if grouping over multiple + # Categoricals. This will done by later self._reindex_output() + # Doing it here creates an error. See GH#34951 + sgb = get_groupby(obj, self.grouper, observed=True) + result = sgb.aggregate(lambda x: alt(x, axis=self.axis)) + + assert isinstance(result, (Series, DataFrame)) # for mypy + # In the case of object dtype block, it may have been split + # in the operation. We un-split here. 
+ result = result._consolidate() + assert isinstance(result, (Series, DataFrame)) # for mypy + assert len(result._mgr.blocks) == 1 + + # unwrap DataFrame to get array + result = result._mgr.blocks[0].values return cast_agg_result(result, bvalues, how) @@ -1145,6 +1150,7 @@ def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame: data = obj[item] colg = SeriesGroupBy(data, selection=item, grouper=self.grouper) + cast = self._transform_should_cast(func) try: result[item] = colg.aggregate(func, *args, **kwargs) @@ -1157,6 +1163,10 @@ def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame: cannot_agg.append(item) continue + else: + if cast: + result[item] = maybe_cast_result(result[item], data) + result_columns = obj.columns if cannot_agg: result_columns = result_columns.drop(cannot_agg) @@ -1274,7 +1284,7 @@ def _wrap_applied_output_series( # as we are stacking can easily have object dtypes here so = self._selected_obj if so.ndim == 2 and so.dtypes.apply(needs_i8_conversion).any(): - result = result._convert(datetime=True) + result = _recast_datetimelike_result(result) else: result = result._convert(datetime=True) @@ -1826,46 +1836,40 @@ def nunique(self, dropna: bool = True) -> DataFrame: self._insert_inaxis_grouper_inplace(results) return results - @Appender(DataFrame.idxmax.__doc__) - def idxmax(self, axis=0, skipna: bool = True): - axis = DataFrame._get_axis_number(axis) - numeric_only = None if axis == 0 else False - - def func(df): - # NB: here we use numeric_only=None, in DataFrame it is False GH#38217 - res = df._reduce( - nanops.nanargmax, - "argmax", - axis=axis, - skipna=skipna, - numeric_only=numeric_only, - ) - indices = res._values - index = df._get_axis(axis) - result = [index[i] if i >= 0 else np.nan for i in indices] - return df._constructor_sliced(result, index=res.index) - - return self._python_apply_general(func, self._obj_with_exclusions) - - @Appender(DataFrame.idxmin.__doc__) - def idxmin(self, axis=0, skipna: bool = True): - axis = DataFrame._get_axis_number(axis) - numeric_only = None if axis == 0 else False - - def func(df): - # NB: here we use numeric_only=None, in DataFrame it is False GH#38217 - res = df._reduce( - nanops.nanargmin, - "argmin", - axis=axis, - skipna=skipna, - numeric_only=numeric_only, - ) - indices = res._values - index = df._get_axis(axis) - result = [index[i] if i >= 0 else np.nan for i in indices] - return df._constructor_sliced(result, index=res.index) + boxplot = boxplot_frame_groupby - return self._python_apply_general(func, self._obj_with_exclusions) - boxplot = boxplot_frame_groupby +def _recast_datetimelike_result(result: DataFrame) -> DataFrame: + """ + If we have date/time like in the original, then coerce dates + as we are stacking can easily have object dtypes here. 
+ + Parameters + ---------- + result : DataFrame + + Returns + ------- + DataFrame + + Notes + ----- + - Assumes Groupby._selected_obj has ndim==2 and at least one + datetimelike column + """ + result = result.copy() + + obj_cols = [ + idx + for idx in range(len(result.columns)) + if is_object_dtype(result.dtypes.iloc[idx]) + ] + + # See GH#26285 + for n in obj_cols: + converted = maybe_convert_objects( + result.iloc[:, n].values, convert_numeric=False + ) + + result.iloc[:, n] = converted + return result diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 23f0e178130be..ae3612c99d5cd 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -11,7 +11,7 @@ class providing the base-class of operations. import datetime from functools import partial, wraps import inspect -from textwrap import dedent +import re import types from typing import ( Callable, @@ -45,13 +45,12 @@ class providing the base-class of operations. IndexLabel, Label, Scalar, - final, ) from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError from pandas.util._decorators import Appender, Substitution, cache_readonly, doc -from pandas.core.dtypes.cast import maybe_downcast_to_dtype +from pandas.core.dtypes.cast import maybe_cast_result from pandas.core.dtypes.common import ( ensure_float, is_bool_dtype, @@ -86,8 +85,8 @@ class providing the base-class of operations. to each row or column of a DataFrame. """ -_apply_docs = { - "template": """ +_apply_docs = dict( + template=""" Apply function `func` group-wise and combine the results together. The function passed to `apply` must take a {input} as its first @@ -124,7 +123,7 @@ class providing the base-class of operations. Series.apply : Apply a function to a Series. DataFrame.apply : Apply a function to each row or column of a DataFrame. """, - "dataframe_examples": """ + dataframe_examples=""" >>> df = pd.DataFrame({'A': 'a a b'.split(), 'B': [1,2,3], 'C': [4,6, 5]}) @@ -164,7 +163,7 @@ class providing the base-class of operations. b 2 dtype: int64 """, - "series_examples": """ + series_examples=""" >>> s = pd.Series([0, 1, 2], index='a a b'.split()) >>> g = s.groupby(s.index) @@ -203,7 +202,7 @@ class providing the base-class of operations. -------- {examples} """, -} +) _groupby_agg_method_template = """ Compute {fname} of group values. @@ -449,7 +448,6 @@ class providing the base-class of operations. """ -@final class GroupByPlot(PandasObject): """ Class implementing the .plot attribute for groupby objects. 
@@ -573,11 +571,9 @@ def __init__( self.grouper = grouper self.exclusions = exclusions or set() - @final def __len__(self) -> int: return len(self.groups) - @final def __repr__(self) -> str: # TODO: Better repr for GroupBy object return object.__repr__(self) @@ -589,7 +585,6 @@ def _assure_grouper(self) -> None: """ pass - @final @property def groups(self) -> Dict[Hashable, np.ndarray]: """ @@ -598,13 +593,11 @@ def groups(self) -> Dict[Hashable, np.ndarray]: self._assure_grouper() return self.grouper.groups - @final @property def ngroups(self) -> int: self._assure_grouper() return self.grouper.ngroups - @final @property def indices(self): """ @@ -613,7 +606,6 @@ def indices(self): self._assure_grouper() return self.grouper.indices - @final def _get_indices(self, names): """ Safe get multiple indices, translate keys for @@ -664,14 +656,12 @@ def get_converter(s): return [self.indices.get(name, []) for name in names] - @final def _get_index(self, name): """ Safe get index, translate keys for datelike to underlying repr. """ return self._get_indices([name])[0] - @final @cache_readonly def _selected_obj(self): # Note: _selected_obj is always just `self.obj` for SeriesGroupBy @@ -683,7 +673,6 @@ def _selected_obj(self): else: return self.obj[self._selection] - @final def _reset_group_selection(self) -> None: """ Clear group based selection. @@ -696,7 +685,6 @@ def _reset_group_selection(self) -> None: self._group_selection = None self._reset_cache("_selected_obj") - @final def _set_group_selection(self) -> None: """ Create group based selection. @@ -722,7 +710,6 @@ def _set_group_selection(self) -> None: self._group_selection = ax.difference(Index(groupers), sort=False).tolist() self._reset_cache("_selected_obj") - @final def _set_result_index_ordered( self, result: "OutputFrameOrSeries" ) -> "OutputFrameOrSeries": @@ -739,7 +726,6 @@ def _set_result_index_ordered( result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True) return result - @final def _dir_additions(self) -> Set[str]: return self.obj._dir_additions() | self._apply_allowlist @@ -755,25 +741,23 @@ def __getattr__(self, attr: str): @Substitution( klass="GroupBy", - examples=dedent( - """\ - >>> df = pd.DataFrame({'A': 'a b a b'.split(), 'B': [1, 2, 3, 4]}) - >>> df - A B - 0 a 1 - 1 b 2 - 2 a 3 - 3 b 4 - - To get the difference between each groups maximum and minimum value in one - pass, you can do - - >>> df.groupby('A').pipe(lambda x: x.max() - x.min()) - B - A - a 2 - b 2""" - ), + examples="""\ +>>> df = pd.DataFrame({'A': 'a b a b'.split(), 'B': [1, 2, 3, 4]}) +>>> df + A B +0 a 1 +1 b 2 +2 a 3 +3 b 4 + +To get the difference between each groups maximum and minimum value in one +pass, you can do + +>>> df.groupby('A').pipe(lambda x: x.max() - x.min()) + B +A +a 2 +b 2""", ) @Appender(_pipe_template) def pipe(self, func, *args, **kwargs): @@ -781,7 +765,6 @@ def pipe(self, func, *args, **kwargs): plot = property(GroupByPlot) - @final def _make_wrapper(self, name: str) -> Callable: assert name in self._apply_allowlist @@ -814,12 +797,27 @@ def curried(x): if name in base.plotting_methods: return self.apply(curried) - return self._python_apply_general(curried, self._obj_with_exclusions) + try: + return self._python_apply_general(curried, self._obj_with_exclusions) + except TypeError as err: + if not re.search( + "reduction operation '.*' not allowed for this dtype", str(err) + ): + # We don't have a cython implementation + # TODO: is the above comment accurate? 
+ raise + + if self.obj.ndim == 1: + # this can be called recursively, so need to raise ValueError + raise ValueError + + # GH#3688 try to operate item-by-item + result = self._aggregate_item_by_item(name, *args, **kwargs) + return result wrapper.__name__ = name return wrapper - @final def get_group(self, name, obj=None): """ Construct DataFrame from group with provided name. @@ -906,7 +904,6 @@ def f(g): return result - @final def _python_apply_general( self, f: F, data: FrameOrSeriesUnion ) -> FrameOrSeriesUnion: @@ -937,7 +934,6 @@ def _iterate_slices(self) -> Iterable[Series]: def transform(self, func, *args, **kwargs): raise AbstractMethodError(self) - @final def _cumcount_array(self, ascending: bool = True): """ Parameters @@ -970,12 +966,24 @@ def _cumcount_array(self, ascending: bool = True): rev[sorter] = np.arange(count, dtype=np.intp) return out[rev].astype(np.int64, copy=False) - @final - def _cython_transform( - self, how: str, numeric_only: bool = True, axis: int = 0, **kwargs - ): - output: Dict[base.OutputKey, np.ndarray] = {} + def _transform_should_cast(self, func_nm: str) -> bool: + """ + Parameters + ---------- + func_nm: str + The name of the aggregation function being performed + Returns + ------- + bool + Whether transform should attempt to cast the result of aggregation + """ + filled_series = self.grouper.size().fillna(0) + assert filled_series is not None + return filled_series.gt(0).any() and func_nm not in base.cython_cast_blocklist + + def _cython_transform(self, how: str, numeric_only: bool = True, **kwargs): + output: Dict[base.OutputKey, np.ndarray] = {} for idx, obj in enumerate(self._iterate_slices()): name = obj.name is_numeric = is_numeric_dtype(obj.dtype) @@ -983,12 +991,13 @@ def _cython_transform( continue try: - result = self.grouper._cython_operation( - "transform", obj._values, how, axis, **kwargs - ) + result, _ = self.grouper.transform(obj.values, how, **kwargs) except NotImplementedError: continue + if self._transform_should_cast(how): + result = maybe_cast_result(result, obj, how=how) + key = base.OutputKey(label=name, position=idx) output[key] = result @@ -1008,7 +1017,6 @@ def _wrap_transformed_output(self, output: Mapping[base.OutputKey, np.ndarray]): def _wrap_applied_output(self, keys, values, not_indexed_same: bool = False): raise AbstractMethodError(self) - @final def _agg_general( self, numeric_only: bool = True, @@ -1059,22 +1067,21 @@ def _cython_agg_general( if numeric_only and not is_numeric: continue - result = self.grouper._cython_operation( - "aggregate", obj._values, how, axis=0, min_count=min_count + result, agg_names = self.grouper.aggregate( + obj._values, how, min_count=min_count ) - if how == "ohlc": + if agg_names: # e.g. ohlc - agg_names = ["open", "high", "low", "close"] assert len(agg_names) == result.shape[1] for result_column, result_name in zip(result.T, agg_names): key = base.OutputKey(label=result_name, position=idx) - output[key] = result_column + output[key] = maybe_cast_result(result_column, obj, how=how) idx += 1 else: assert result.ndim == 1 key = base.OutputKey(label=name, position=idx) - output[key] = result + output[key] = maybe_cast_result(result, obj, how=how) idx += 1 if not output: @@ -1082,7 +1089,6 @@ def _cython_agg_general( return self._wrap_aggregated_output(output, index=self.grouper.result_index) - @final def _transform_with_numba(self, data, func, *args, engine_kwargs=None, **kwargs): """ Perform groupby transform routine with the numba engine. 
@@ -1117,7 +1123,6 @@ def _transform_with_numba(self, data, func, *args, engine_kwargs=None, **kwargs) # evaluated the data sorted by group return result.take(np.argsort(sorted_index), axis=0) - @final def _aggregate_with_numba(self, data, func, *args, engine_kwargs=None, **kwargs): """ Perform groupby aggregation routine with the numba engine. @@ -1154,7 +1159,6 @@ def _aggregate_with_numba(self, data, func, *args, engine_kwargs=None, **kwargs) index = Index(group_keys, name=self.grouper.names[0]) return result, index - @final def _python_agg_general(self, func, *args, **kwargs): func = self._is_builtin_func(func) f = lambda x: func(x, *args, **kwargs) @@ -1176,28 +1180,25 @@ def _python_agg_general(self, func, *args, **kwargs): assert result is not None key = base.OutputKey(label=name, position=idx) + output[key] = maybe_cast_result(result, obj, numeric_only=True) - if is_numeric_dtype(obj.dtype): - result = maybe_downcast_to_dtype(result, obj.dtype) + if not output: + return self._python_apply_general(f, self._selected_obj) - if self.grouper._filter_empty_groups: - mask = counts.ravel() > 0 + if self.grouper._filter_empty_groups: + + mask = counts.ravel() > 0 + for key, result in output.items(): # since we are masking, make sure that we have a float object values = result if is_numeric_dtype(values.dtype): values = ensure_float(values) - result = maybe_downcast_to_dtype(values[mask], result.dtype) - - output[key] = result - - if not output: - return self._python_apply_general(f, self._selected_obj) + output[key] = maybe_cast_result(values[mask], result) return self._wrap_aggregated_output(output, index=self.grouper.result_index) - @final def _concat_objects(self, keys, values, not_indexed_same: bool = False): from pandas.core.reshape.concat import concat @@ -1211,7 +1212,7 @@ def reset_identity(values): if not not_indexed_same: result = concat(values, axis=self.axis) - ax = self.filter(lambda x: True).axes[self.axis] + ax = self._selected_obj._get_axis(self.axis) # this is a very unfortunate situation # we can't use reindex to restore the original order @@ -1259,7 +1260,6 @@ def reset_identity(values): return result - @final def _apply_filter(self, indices, dropna): if len(indices) == 0: indices = np.array([], dtype="int64") @@ -1349,7 +1349,6 @@ class GroupBy(BaseGroupBy[FrameOrSeries]): more """ - @final @property def _obj_1d_constructor(self) -> Type["Series"]: # GH28330 preserve subclassed Series/DataFrames @@ -1358,7 +1357,6 @@ def _obj_1d_constructor(self) -> Type["Series"]: assert isinstance(self.obj, Series) return self.obj._constructor - @final def _bool_agg(self, val_test, skipna): """ Shared func to call any / all Cython GroupBy implementations. 
@@ -1388,7 +1386,6 @@ def result_to_bool(result: np.ndarray, inference: Type) -> np.ndarray: skipna=skipna, ) - @final @Substitution(name="groupby") @Appender(_common_see_also) def any(self, skipna: bool = True): @@ -1408,7 +1405,6 @@ def any(self, skipna: bool = True): """ return self._bool_agg("any", skipna) - @final @Substitution(name="groupby") @Appender(_common_see_also) def all(self, skipna: bool = True): @@ -1442,7 +1438,6 @@ def count(self): # defined here for API doc raise NotImplementedError - @final @Substitution(name="groupby") @Substitution(see_also=_common_see_also) def mean(self, numeric_only: bool = True): @@ -1499,7 +1494,6 @@ def mean(self, numeric_only: bool = True): numeric_only=numeric_only, ) - @final @Substitution(name="groupby") @Appender(_common_see_also) def median(self, numeric_only=True): @@ -1525,7 +1519,6 @@ def median(self, numeric_only=True): numeric_only=numeric_only, ) - @final @Substitution(name="groupby") @Appender(_common_see_also) def std(self, ddof: int = 1): @@ -1555,7 +1548,6 @@ def std(self, ddof: int = 1): ddof=ddof, ) - @final @Substitution(name="groupby") @Appender(_common_see_also) def var(self, ddof: int = 1): @@ -1583,7 +1575,6 @@ def var(self, ddof: int = 1): with group_selection_context(self): return self._python_agg_general(func) - @final @Substitution(name="groupby") @Appender(_common_see_also) def sem(self, ddof: int = 1): @@ -1614,7 +1605,6 @@ def sem(self, ddof: int = 1): ) return result - @final @Substitution(name="groupby") @Appender(_common_see_also) def size(self) -> FrameOrSeriesUnion: @@ -1640,7 +1630,6 @@ def size(self) -> FrameOrSeriesUnion: return self._reindex_output(result, fill_value=0) - @final @doc(_groupby_agg_method_template, fname="sum", no=True, mc=0) def sum(self, numeric_only: bool = True, min_count: int = 0): @@ -1657,28 +1646,24 @@ def sum(self, numeric_only: bool = True, min_count: int = 0): return self._reindex_output(result, fill_value=0) - @final @doc(_groupby_agg_method_template, fname="prod", no=True, mc=0) def prod(self, numeric_only: bool = True, min_count: int = 0): return self._agg_general( numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod ) - @final @doc(_groupby_agg_method_template, fname="min", no=False, mc=-1) def min(self, numeric_only: bool = False, min_count: int = -1): return self._agg_general( numeric_only=numeric_only, min_count=min_count, alias="min", npfunc=np.min ) - @final @doc(_groupby_agg_method_template, fname="max", no=False, mc=-1) def max(self, numeric_only: bool = False, min_count: int = -1): return self._agg_general( numeric_only=numeric_only, min_count=min_count, alias="max", npfunc=np.max ) - @final @doc(_groupby_agg_method_template, fname="first", no=False, mc=-1) def first(self, numeric_only: bool = False, min_count: int = -1): def first_compat(obj: FrameOrSeries, axis: int = 0): @@ -1703,7 +1688,6 @@ def first(x: Series): npfunc=first_compat, ) - @final @doc(_groupby_agg_method_template, fname="last", no=False, mc=-1) def last(self, numeric_only: bool = False, min_count: int = -1): def last_compat(obj: FrameOrSeries, axis: int = 0): @@ -1728,7 +1712,6 @@ def last(x: Series): npfunc=last_compat, ) - @final @Substitution(name="groupby") @Appender(_common_see_also) def ohlc(self) -> DataFrame: @@ -1744,7 +1727,6 @@ def ohlc(self) -> DataFrame: """ return self._apply_to_column_groupbys(lambda x: x._cython_agg_general("ohlc")) - @final @doc(DataFrame.describe) def describe(self, **kwargs): with group_selection_context(self): @@ -1753,7 +1735,6 @@ def 
describe(self, **kwargs): return result.T return result.unstack() - @final def resample(self, rule, *args, **kwargs): """ Provide resampling when using a TimeGrouper. @@ -1855,7 +1836,6 @@ def resample(self, rule, *args, **kwargs): return get_resampler_for_grouping(self, rule, *args, **kwargs) - @final @Substitution(name="groupby") @Appender(_common_see_also) def rolling(self, *args, **kwargs): @@ -1866,7 +1846,6 @@ def rolling(self, *args, **kwargs): return RollingGroupby(self, *args, **kwargs) - @final @Substitution(name="groupby") @Appender(_common_see_also) def expanding(self, *args, **kwargs): @@ -1878,7 +1857,6 @@ def expanding(self, *args, **kwargs): return ExpandingGroupby(self, *args, **kwargs) - @final @Substitution(name="groupby") @Appender(_common_see_also) def ewm(self, *args, **kwargs): @@ -1889,7 +1867,6 @@ def ewm(self, *args, **kwargs): return ExponentialMovingWindowGroupby(self, *args, **kwargs) - @final def _fill(self, direction, limit=None): """ Shared function for `pad` and `backfill` to call Cython method. @@ -1928,7 +1905,6 @@ def _fill(self, direction, limit=None): dropna=self.dropna, ) - @final @Substitution(name="groupby") def pad(self, limit=None): """ @@ -1955,7 +1931,6 @@ def pad(self, limit=None): ffill = pad - @final @Substitution(name="groupby") def backfill(self, limit=None): """ @@ -1982,7 +1957,6 @@ def backfill(self, limit=None): bfill = backfill - @final @Substitution(name="groupby") @Substitution(see_also=_common_see_also) def nth(self, n: Union[int, List[int]], dropna: Optional[str] = None) -> DataFrame: @@ -2156,7 +2130,6 @@ def nth(self, n: Union[int, List[int]], dropna: Optional[str] = None) -> DataFra return result - @final def quantile(self, q=0.5, interpolation: str = "linear"): """ Return group values at the given quantile, a la numpy.percentile. @@ -2254,38 +2227,30 @@ def post_processor(vals: np.ndarray, inference: Optional[Type]) -> np.ndarray: ) for qi in q ] - result = concat(results, axis=self.axis, keys=q) + result = concat(results, axis=0, keys=q) # fix levels to place quantiles on the inside # TODO(GH-10710): Ideally, we could write this as # >>> result.stack(0).loc[pd.IndexSlice[:, ..., q], :] # but this hits https://github.com/pandas-dev/pandas/issues/10710 # which doesn't reorder the list-like `q` on the inner level. 
- order = list(range(1, result.axes[self.axis].nlevels)) + [0] + order = list(range(1, result.index.nlevels)) + [0] # temporarily saves the index names - index_names = np.array(result.axes[self.axis].names) + index_names = np.array(result.index.names) # set index names to positions to avoid confusion - result.axes[self.axis].names = np.arange(len(index_names)) + result.index.names = np.arange(len(index_names)) # place quantiles on the inside - if isinstance(result, Series): - result = result.reorder_levels(order) - else: - result = result.reorder_levels(order, axis=self.axis) + result = result.reorder_levels(order) # restore the index names in order - result.axes[self.axis].names = index_names[order] + result.index.names = index_names[order] # reorder rows to keep things sorted - indices = ( - np.arange(result.shape[self.axis]) - .reshape([len(q), self.ngroups]) - .T.flatten() - ) - return result.take(indices, axis=self.axis) + indices = np.arange(len(result)).reshape([len(q), self.ngroups]).T.flatten() + return result.take(indices) - @final @Substitution(name="groupby") def ngroup(self, ascending: bool = True): """ @@ -2353,7 +2318,6 @@ def ngroup(self, ascending: bool = True): result = self.ngroups - 1 - result return result - @final @Substitution(name="groupby") def cumcount(self, ascending: bool = True): """ @@ -2413,7 +2377,6 @@ def cumcount(self, ascending: bool = True): cumcounts = self._cumcount_array(ascending=ascending) return self._obj_1d_constructor(cumcounts, index) - @final @Substitution(name="groupby") @Appender(_common_see_also) def rank( @@ -2463,7 +2426,6 @@ def rank( axis=axis, ) - @final @Substitution(name="groupby") @Appender(_common_see_also) def cumprod(self, axis=0, *args, **kwargs): @@ -2480,7 +2442,6 @@ def cumprod(self, axis=0, *args, **kwargs): return self._cython_transform("cumprod", **kwargs) - @final @Substitution(name="groupby") @Appender(_common_see_also) def cumsum(self, axis=0, *args, **kwargs): @@ -2497,7 +2458,6 @@ def cumsum(self, axis=0, *args, **kwargs): return self._cython_transform("cumsum", **kwargs) - @final @Substitution(name="groupby") @Appender(_common_see_also) def cummin(self, axis=0, **kwargs): @@ -2513,7 +2473,6 @@ def cummin(self, axis=0, **kwargs): return self._cython_transform("cummin", numeric_only=False) - @final @Substitution(name="groupby") @Appender(_common_see_also) def cummax(self, axis=0, **kwargs): @@ -2529,7 +2488,6 @@ def cummax(self, axis=0, **kwargs): return self._cython_transform("cummax", numeric_only=False) - @final def _get_cythonized_result( self, how: str, @@ -2688,7 +2646,6 @@ def _get_cythonized_result( else: return self._wrap_transformed_output(output) - @final @Substitution(name="groupby") def shift(self, periods=1, freq=None, axis=0, fill_value=None): """ @@ -2732,7 +2689,6 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): periods=periods, ) - @final @Substitution(name="groupby") @Appender(_common_see_also) def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None, axis=0): @@ -2762,7 +2718,6 @@ def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None, axis=0 shifted = fill_grp.shift(periods=periods, freq=freq, axis=self.axis) return (filled / shifted) - 1 - @final @Substitution(name="groupby") @Substitution(see_also=_common_see_also) def head(self, n=5): @@ -2800,7 +2755,6 @@ def head(self, n=5): else: return self._selected_obj.iloc[:, mask] - @final @Substitution(name="groupby") @Substitution(see_also=_common_see_also) def tail(self, n=5): @@ -2838,7 +2792,6 @@ 
def tail(self, n=5): else: return self._selected_obj.iloc[:, mask] - @final def _reindex_output( self, output: OutputFrameOrSeries, fill_value: Scalar = np.NaN ) -> OutputFrameOrSeries: @@ -2925,7 +2878,6 @@ def _reindex_output( return output.reset_index(drop=True) - @final def sample( self, n: Optional[int] = None, diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 496aa6f327096..261190747ee61 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -7,7 +7,7 @@ import numpy as np -from pandas._typing import FrameOrSeries, Label, final +from pandas._typing import FrameOrSeries, Label from pandas.errors import InvalidIndexError from pandas.util._decorators import cache_readonly @@ -18,6 +18,7 @@ is_scalar, is_timedelta64_dtype, ) +from pandas.core.dtypes.generic import ABCSeries import pandas.core.algorithms as algorithms from pandas.core.arrays import Categorical, ExtensionArray @@ -314,7 +315,6 @@ def __init__( self._grouper = None self.dropna = dropna - @final @property def ax(self): return self.grouper @@ -346,7 +346,6 @@ def _get_grouper(self, obj, validate: bool = True): ) return self.binner, self.grouper, self.obj - @final def _set_grouper(self, obj: FrameOrSeries, sort: bool = False): """ given an object and the specifications, setup the internal grouper @@ -371,7 +370,9 @@ def _set_grouper(self, obj: FrameOrSeries, sort: bool = False): if self.key is not None: key = self.key # The 'on' is already defined - if getattr(self.grouper, "name", None) == key and isinstance(obj, Series): + if getattr(self.grouper, "name", None) == key and isinstance( + obj, ABCSeries + ): # pandas\core\groupby\grouper.py:348: error: Item "None" of # "Optional[Any]" has no attribute "take" [union-attr] ax = self._grouper.take(obj.index) # type: ignore[union-attr] @@ -406,14 +407,12 @@ def _set_grouper(self, obj: FrameOrSeries, sort: bool = False): self.grouper = ax return self.grouper - @final @property def groups(self): # pandas\core\groupby\grouper.py:382: error: Item "None" of # "Optional[Any]" has no attribute "groups" [union-attr] return self.grouper.groups # type: ignore[union-attr] - @final def __repr__(self) -> str: attrs_list = ( f"{attr_name}={repr(getattr(self, attr_name))}" @@ -425,7 +424,6 @@ def __repr__(self) -> str: return f"{cls_name}({attrs})" -@final class Grouping: """ Holds the grouping information for a single key diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index c60a59916affc..50c4cc53a12bb 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -24,15 +24,11 @@ from pandas._libs import NaT, iNaT, lib import pandas._libs.groupby as libgroupby import pandas._libs.reduction as libreduction -from pandas._typing import ArrayLike, F, FrameOrSeries, Label, Shape, final +from pandas._typing import F, FrameOrSeries, Label, Shape from pandas.errors import AbstractMethodError from pandas.util._decorators import cache_readonly -from pandas.core.dtypes.cast import ( - maybe_cast_result, - maybe_cast_result_dtype, - maybe_downcast_to_dtype, -) +from pandas.core.dtypes.cast import maybe_cast_result from pandas.core.dtypes.common import ( ensure_float, ensure_float64, @@ -146,7 +142,6 @@ def get_iterator( for key, (i, group) in zip(keys, splitter): yield key, group.__finalize__(data, method="groupby") - @final def _get_splitter(self, data: FrameOrSeries, axis: int = 0) -> "DataSplitter": """ Returns @@ -167,7 +162,6 @@ def _get_grouper(self): """ return self.groupings[0].grouper - @final def 
_get_group_keys(self): if len(self.groupings) == 1: return self.levels[0] @@ -177,7 +171,6 @@ def _get_group_keys(self): # provide "flattened" iterator for multi-group setting return get_flattened_list(comp_ids, ngroups, self.levels, self.codes) - @final def apply(self, f: F, data: FrameOrSeries, axis: int = 0): mutated = self.mutated splitter = self._get_splitter(data, axis=axis) @@ -259,7 +252,6 @@ def levels(self) -> List[Index]: def names(self) -> List[Label]: return [ping.name for ping in self.groupings] - @final def size(self) -> Series: """ Compute group sizes. @@ -282,7 +274,6 @@ def groups(self) -> Dict[Hashable, np.ndarray]: to_groupby = Index(to_groupby) return self.axis.groupby(to_groupby) - @final @cache_readonly def is_monotonic(self) -> bool: # return if my group orderings are monotonic @@ -296,7 +287,6 @@ def group_info(self): comp_ids = ensure_int64(comp_ids) return comp_ids, obs_group_ids, ngroups - @final @cache_readonly def codes_info(self) -> np.ndarray: # return the codes of items in original grouped axis @@ -306,7 +296,6 @@ def codes_info(self) -> np.ndarray: codes = codes[sorter] return codes - @final def _get_compressed_codes(self) -> Tuple[np.ndarray, np.ndarray]: all_codes = self.codes if len(all_codes) > 1: @@ -316,7 +305,6 @@ def _get_compressed_codes(self) -> Tuple[np.ndarray, np.ndarray]: ping = self.groupings[0] return ping.codes, np.arange(len(ping.group_index)) - @final @cache_readonly def ngroups(self) -> int: return len(self.result_index) @@ -338,7 +326,6 @@ def result_index(self) -> Index: levels=levels, codes=codes, verify_integrity=False, names=self.names ) - @final def get_group_levels(self) -> List[Index]: if not self.compressed and len(self.groupings) == 1: return [self.groupings[0].result_index] @@ -379,7 +366,8 @@ def get_group_levels(self) -> List[Index]: _cython_arity = {"ohlc": 4} # OHLC - @final + _name_functions = {"ohlc": ["open", "high", "low", "close"]} + def _is_builtin_func(self, arg): """ if we define a builtin function for this argument, return it, @@ -387,7 +375,6 @@ def _is_builtin_func(self, arg): """ return SelectionMixin._builtin_table.get(arg, arg) - @final def _get_cython_function( self, kind: str, how: str, values: np.ndarray, is_numeric: bool ): @@ -424,7 +411,6 @@ def _get_cython_function( return func - @final def _get_cython_func_and_vals( self, kind: str, how: str, values: np.ndarray, is_numeric: bool ): @@ -459,82 +445,17 @@ def _get_cython_func_and_vals( raise return func, values - @final - def _disallow_invalid_ops(self, values: ArrayLike, how: str): - """ - Check if we can do this operation with our cython functions. - - Raises - ------ - NotImplementedError - This is either not a valid function for this dtype, or - valid but not implemented in cython. - """ - dtype = values.dtype - - if is_categorical_dtype(dtype) or is_sparse(dtype): - # categoricals are only 1d, so we - # are not setup for dim transforming - raise NotImplementedError(f"{dtype} dtype not supported") - elif is_datetime64_any_dtype(dtype): - # we raise NotImplemented if this is an invalid operation - # entirely, e.g. 
adding datetimes - if how in ["add", "prod", "cumsum", "cumprod"]: - raise NotImplementedError( - f"datetime64 type does not support {how} operations" - ) - elif is_timedelta64_dtype(dtype): - if how in ["prod", "cumprod"]: - raise NotImplementedError( - f"timedelta64 type does not support {how} operations" - ) - - @final - def _ea_wrap_cython_operation( + def _cython_operation( self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs ) -> Tuple[np.ndarray, Optional[List[str]]]: """ - If we have an ExtensionArray, unwrap, call _cython_operation, and - re-wrap if appropriate. - """ - # TODO: general case implementation overrideable by EAs. - orig_values = values - - if is_datetime64tz_dtype(values.dtype) or is_period_dtype(values.dtype): - # All of the functions implemented here are ordinal, so we can - # operate on the tz-naive equivalents - values = values.view("M8[ns]") - res_values = self._cython_operation( - kind, values, how, axis, min_count, **kwargs - ) - if how in ["rank"]: - # preserve float64 dtype - return res_values - - res_values = res_values.astype("i8", copy=False) - result = type(orig_values)._simple_new(res_values, dtype=orig_values.dtype) - return result - - elif is_integer_dtype(values.dtype) or is_bool_dtype(values.dtype): - # IntegerArray or BooleanArray - values = ensure_int_or_float(values) - res_values = self._cython_operation( - kind, values, how, axis, min_count, **kwargs - ) - result = maybe_cast_result(result=res_values, obj=orig_values, how=how) - return result - - raise NotImplementedError(values.dtype) + Returns the values of a cython operation as a Tuple of [data, names]. - @final - def _cython_operation( - self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs - ) -> np.ndarray: + Names is only useful when dealing with 2D results, like ohlc + (see self._name_functions). """ - Returns the values of a cython operation. - """ - orig_values = values assert kind in ["transform", "aggregate"] + orig_values = values if values.ndim > 2: raise NotImplementedError("number of dimensions is currently limited to 2") @@ -545,12 +466,30 @@ def _cython_operation( # can we do this operation with our cython functions # if not raise NotImplementedError - self._disallow_invalid_ops(values, how) - if is_extension_array_dtype(values.dtype): - return self._ea_wrap_cython_operation( - kind, values, how, axis, min_count, **kwargs - ) + # we raise NotImplemented if this is an invalid operation + # entirely, e.g. adding datetimes + + # categoricals are only 1d, so we + # are not setup for dim transforming + if is_categorical_dtype(values.dtype) or is_sparse(values.dtype): + raise NotImplementedError(f"{values.dtype} dtype not supported") + elif is_datetime64_any_dtype(values.dtype): + if how in ["add", "prod", "cumsum", "cumprod"]: + raise NotImplementedError( + f"datetime64 type does not support {how} operations" + ) + elif is_timedelta64_dtype(values.dtype): + if how in ["prod", "cumprod"]: + raise NotImplementedError( + f"timedelta64 type does not support {how} operations" + ) + + if is_datetime64tz_dtype(values.dtype): + # Cast to naive; we'll cast back at the end of the function + # TODO: possible need to reshape? + # TODO(EA2D):kludge can be avoided when 2D EA is allowed. 
+ values = values.view("M8[ns]") is_datetimelike = needs_i8_conversion(values.dtype) is_numeric = is_numeric_dtype(values.dtype) @@ -629,18 +568,36 @@ def _cython_operation( if vdim == 1 and arity == 1: result = result[:, 0] + names: Optional[List[str]] = self._name_functions.get(how, None) + if swapped: result = result.swapaxes(0, axis) - if how not in base.cython_cast_blocklist: - # e.g. if we are int64 and need to restore to datetime64/timedelta64 - # "rank" is the only member of cython_cast_blocklist we get here - dtype = maybe_cast_result_dtype(orig_values.dtype, how) - result = maybe_downcast_to_dtype(result, dtype) + if is_datetime64tz_dtype(orig_values.dtype) or is_period_dtype( + orig_values.dtype + ): + # We need to use the constructors directly for these dtypes + # since numpy won't recognize them + # https://github.com/pandas-dev/pandas/issues/31471 + result = type(orig_values)(result.astype(np.int64), dtype=orig_values.dtype) + elif is_datetimelike and kind == "aggregate": + result = result.astype(orig_values.dtype) + + if is_extension_array_dtype(orig_values.dtype): + result = maybe_cast_result(result=result, obj=orig_values, how=how) - return result + return result, names + + def aggregate( + self, values, how: str, axis: int = 0, min_count: int = -1 + ) -> Tuple[np.ndarray, Optional[List[str]]]: + return self._cython_operation( + "aggregate", values, how, axis, min_count=min_count + ) + + def transform(self, values, how: str, axis: int = 0, **kwargs): + return self._cython_operation("transform", values, how, axis, **kwargs) - @final def _aggregate( self, result, counts, values, comp_ids, agg_func, min_count: int = -1 ): @@ -652,7 +609,6 @@ def _aggregate( return result - @final def _transform( self, result, values, comp_ids, transform_func, is_datetimelike: bool, **kwargs ): @@ -691,7 +647,6 @@ def agg_series(self, obj: Series, func: F): raise return self._aggregate_series_pure_python(obj, func) - @final def _aggregate_series_fast(self, obj: Series, func: F): # At this point we have already checked that # - obj.index is not a MultiIndex @@ -711,7 +666,6 @@ def _aggregate_series_fast(self, obj: Series, func: F): result, counts = grouper.get_result() return result, counts - @final def _aggregate_series_pure_python(self, obj: Series, func: F): group_index, _, ngroups = self.group_info @@ -737,7 +691,7 @@ def _aggregate_series_pure_python(self, obj: Series, func: F): result[label] = res result = lib.maybe_convert_objects(result, try_float=0) - result = maybe_cast_result(result, obj, numeric_only=True) + # TODO: maybe_cast_to_extension_array? 
return result, counts diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py index da4654bbf2c10..b6713bc760c5e 100644 --- a/pandas/core/indexers.py +++ b/pandas/core/indexers.py @@ -79,9 +79,6 @@ def is_scalar_indexer(indexer, ndim: int) -> bool: ------- bool """ - if ndim == 1 and is_integer(indexer): - # GH37748: allow indexer to be an integer for Series - return True if isinstance(indexer, tuple): if len(indexer) == ndim: return all( diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index ba958b23e81af..c49f3f9457161 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -33,7 +33,6 @@ from pandas.util._decorators import Appender, cache_readonly, doc from pandas.core.dtypes.cast import ( - find_common_type, maybe_cast_to_integer_array, validate_numeric_casting, ) @@ -70,6 +69,7 @@ ABCMultiIndex, ABCPandasArray, ABCPeriodIndex, + ABCRangeIndex, ABCSeries, ABCTimedeltaIndex, ) @@ -106,15 +106,15 @@ _unsortable_types = frozenset(("mixed", "mixed-integer")) -_index_doc_kwargs = { - "klass": "Index", - "inplace": "", - "target_klass": "Index", - "raises_section": "", - "unique": "Index", - "duplicated": "np.ndarray", -} -_index_shared_docs = {} +_index_doc_kwargs = dict( + klass="Index", + inplace="", + target_klass="Index", + raises_section="", + unique="Index", + duplicated="np.ndarray", +) +_index_shared_docs = dict() str_t = str @@ -817,7 +817,7 @@ def _maybe_disallow_fill(self, allow_fill: bool, fill_value, indices) -> bool: @Appender(_index_shared_docs["repeat"] % _index_doc_kwargs) def repeat(self, repeats, axis=None): repeats = ensure_platform_int(repeats) - nv.validate_repeat(tuple(), {"axis": axis}) + nv.validate_repeat(tuple(), dict(axis=axis)) return self._shallow_copy(self._values.repeat(repeats)) # -------------------------------------------------------------------- @@ -2155,7 +2155,7 @@ def is_all_dates(self): # Pickle Methods def __reduce__(self): - d = {"data": self._data} + d = dict(data=self._data) d.update(self._get_attributes_dict()) return _new_Index, (type(self), d), None @@ -2379,10 +2379,6 @@ def unique(self, level=None): """ if level is not None: self._validate_index_level(level) - - if self.is_unique: - return self._shallow_copy() - result = super().unique() return self._shallow_copy(result) @@ -2694,7 +2690,7 @@ def union(self, other, sort=None): """ self._validate_sort_keyword(sort) self._assert_can_do_setop(other) - other, result_name = self._convert_can_do_setop(other) + other = ensure_index(other) if not self._can_union_without_object_cast(other): return self._union_incompatible_dtypes(other, sort=sort) @@ -2824,15 +2820,14 @@ def intersection(self, other, sort=False): """ self._validate_sort_keyword(sort) self._assert_can_do_setop(other) - other, _ = self._convert_can_do_setop(other) + other = ensure_index(other) - if self.equals(other) and not self.has_duplicates: + if self.equals(other): return self._get_reconciled_name_object(other) if not is_dtype_equal(self.dtype, other.dtype): - dtype = find_common_type([self.dtype, other.dtype]) - this = self.astype(dtype, copy=False) - other = other.astype(dtype, copy=False) + this = self.astype("O") + other = other.astype("O") return this.intersection(other, sort=sort) result = self._intersection(other, sort=sort) @@ -2852,7 +2847,7 @@ def _intersection(self, other, sort=False): except TypeError: pass else: - return algos.unique1d(result) + return result try: indexer = Index(rvals).get_indexer(lvals) @@ -2863,14 +2858,11 @@ def _intersection(self, other, 
sort=False): indexer = algos.unique1d(Index(rvals).get_indexer_non_unique(lvals)[0]) indexer = indexer[indexer != -1] - result = other.take(indexer).unique()._values + result = other.take(indexer)._values if sort is None: result = algos.safe_sort(result) - # Intersection has to be unique - assert Index(result).is_unique - return result def difference(self, other, sort=None): @@ -2913,15 +2905,12 @@ def difference(self, other, sort=None): """ self._validate_sort_keyword(sort) self._assert_can_do_setop(other) - other, result_name = self._convert_can_do_setop(other) if self.equals(other): - return self[:0].rename(result_name) + # pass an empty np.ndarray with the appropriate dtype + return self._shallow_copy(self._data[:0]) - result = self._difference(other, sort=sort) - return self._wrap_setop_result(other, result) - - def _difference(self, other, sort): + other, result_name = self._convert_can_do_setop(other) this = self._get_unique_index() @@ -2936,7 +2925,7 @@ def _difference(self, other, sort): except TypeError: pass - return the_diff + return this._shallow_copy(the_diff, name=result_name) def symmetric_difference(self, other, result_name=None, sort=None): """ @@ -3496,7 +3485,12 @@ def reindex(self, target, method=None, level=None, limit=None, tolerance=None): target = ensure_has_len(target) # target may be an iterator if not isinstance(target, Index) and len(target) == 0: - target = self[:0] + values: Union[range, ExtensionArray, np.ndarray] + if isinstance(self, ABCRangeIndex): + values = range(0) + else: + values = self._data[:0] # appropriately-dtyped empty array + target = self._simple_new(values, name=self.name) else: target = ensure_index(target) @@ -3826,7 +3820,6 @@ def _join_non_unique(self, other, how="left", return_indexers=False): else: return join_index - @final def _join_level( self, other, level, how="left", return_indexers=False, keep_order=True ): @@ -3970,7 +3963,6 @@ def _get_leaf_sorter(labels): else: return join_index - @final def _join_monotonic(self, other, how="left", return_indexers=False): # We only get here with matching dtypes assert other.dtype == self.dtype @@ -4188,6 +4180,12 @@ def _coerce_scalar_to_index(self, item): return Index([item], dtype=dtype, **self._get_attributes_dict()) + def _to_safe_for_reshape(self): + """ + Convert to object if we are a categorical. + """ + return self + def _validate_fill_value(self, value): """ Check if the value can be inserted into our array, and convert @@ -4738,10 +4736,7 @@ def shift(self, periods=1, freq=None): '2012-03-01'], dtype='datetime64[ns]', freq='MS') """ - raise NotImplementedError( - f"This method is only implemented for DatetimeIndex, PeriodIndex and " - f"TimedeltaIndex; Got type {type(self).__name__}" - ) + raise NotImplementedError(f"Not supported for type {type(self).__name__}") def argsort(self, *args, **kwargs) -> np.ndarray: """ @@ -4906,31 +4901,16 @@ def get_indexer_non_unique(self, target): # Treat boolean labels passed to a numeric index as not found. Without # this fix False and True would be treated as 0 and 1 respectively. 
# (GH #16877) - return self._get_indexer_non_comparable(target, method=None, unique=False) + no_matches = -1 * np.ones(self.shape, dtype=np.intp) + return no_matches, no_matches pself, ptarget = self._maybe_promote(target) if pself is not self or ptarget is not target: return pself.get_indexer_non_unique(ptarget) - if not self._should_compare(target): - return self._get_indexer_non_comparable(target, method=None, unique=False) - - if not is_dtype_equal(self.dtype, target.dtype): - # TODO: if object, could use infer_dtype to pre-empt costly - # conversion if still non-comparable? - dtype = find_common_type([self.dtype, target.dtype]) - if ( - dtype.kind in ["i", "u"] - and is_categorical_dtype(target.dtype) - and target.hasnans - ): - # FIXME: find_common_type incorrect with Categorical GH#38240 - # FIXME: some cases where float64 cast can be lossy? - dtype = np.dtype(np.float64) - - this = self.astype(dtype, copy=False) - that = target.astype(dtype, copy=False) - return this.get_indexer_non_unique(that) + if not self._is_comparable_dtype(target.dtype): + no_matches = -1 * np.ones(self.shape, dtype=np.intp) + return no_matches, no_matches if is_categorical_dtype(target.dtype): tgt_values = np.asarray(target) @@ -4958,43 +4938,6 @@ def get_indexer_for(self, target, **kwargs): indexer, _ = self.get_indexer_non_unique(target) return indexer - def _get_indexer_non_comparable(self, target: "Index", method, unique: bool = True): - """ - Called from get_indexer or get_indexer_non_unique when the target - is of a non-comparable dtype. - - For get_indexer lookups with method=None, get_indexer is an _equality_ - check, so non-comparable dtypes mean we will always have no matches. - - For get_indexer lookups with a method, get_indexer is an _inequality_ - check, so non-comparable dtypes mean we will always raise TypeError. - - Parameters - ---------- - target : Index - method : str or None - unique : bool, default True - * True if called from get_indexer. - * False if called from get_indexer_non_unique. - - Raises - ------ - TypeError - If doing an inequality check, i.e. method is not None. - """ - if method is not None: - other = unpack_nested_dtype(target) - raise TypeError(f"Cannot compare dtypes {self.dtype} and {other.dtype}") - - no_matches = -1 * np.ones(target.shape, dtype=np.intp) - if unique: - # This is for get_indexer - return no_matches - else: - # This is for get_indexer_non_unique - missing = np.arange(len(target), dtype=np.intp) - return no_matches, missing - @property def _index_as_unique(self): """ @@ -5030,14 +4973,6 @@ def _maybe_promote(self, other: "Index"): return self, other - def _should_compare(self, other: "Index") -> bool: - """ - Check if `self == other` can ever have non-False entries. - """ - other = unpack_nested_dtype(other) - dtype = other.dtype - return self._is_comparable_dtype(dtype) or is_object_dtype(dtype) - def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: """ Can we compare values of the given dtype to our own? 
@@ -5581,7 +5516,7 @@ def drop(self, labels, errors: str_t = "raise"): """ arr_dtype = "object" if self.dtype == "object" else None labels = com.index_labels_to_array(labels, dtype=arr_dtype) - indexer = self.get_indexer_for(labels) + indexer = self.get_indexer(labels) mask = indexer == -1 if mask.any(): if errors != "ignore": @@ -6185,24 +6120,3 @@ def get_unanimous_names(*indexes: Index) -> Tuple[Label, ...]: name_sets = [{*ns} for ns in zip_longest(*name_tups)] names = tuple(ns.pop() if len(ns) == 1 else None for ns in name_sets) return names - - -def unpack_nested_dtype(other: Index) -> Index: - """ - When checking if our dtype is comparable with another, we need - to unpack CategoricalDtype to look at its categories.dtype. - - Parameters - ---------- - other : Index - - Returns - ------- - Index - """ - dtype = other.dtype - if is_categorical_dtype(dtype): - # If there is ever a SparseIndex, this could get dispatched - # here too. - return dtype.categories - return other diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 377fff5f85e92..e2507aeaeb652 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -27,7 +27,7 @@ import pandas.core.missing as missing _index_doc_kwargs = dict(ibase._index_doc_kwargs) -_index_doc_kwargs.update({"target_klass": "CategoricalIndex"}) +_index_doc_kwargs.update(dict(target_klass="CategoricalIndex")) @inherit_names( @@ -399,6 +399,10 @@ def unique(self, level=None): # of result, not self. return type(self)._simple_new(result, name=self.name) + def _to_safe_for_reshape(self): + """ convert to object if we are a categorical """ + return self.astype("object") + def reindex(self, target, method=None, level=None, limit=None, tolerance=None): """ Create index with target's values (move/add/delete values as necessary) @@ -554,9 +558,6 @@ def _maybe_cast_slice_bound(self, label, side: str, kind): # -------------------------------------------------------------------- - def _is_comparable_dtype(self, dtype): - return self.categories._is_comparable_dtype(dtype) - def take_nd(self, *args, **kwargs): """Alias for `take`""" warnings.warn( @@ -636,19 +637,11 @@ def map(self, mapper): mapped = self._values.map(mapper) return Index(mapped, name=self.name) - def _concat(self, to_concat: List["Index"], name: Label) -> Index: + def _concat(self, to_concat: List["Index"], name: Label) -> "CategoricalIndex": # if calling index is category, don't check dtype of others - try: - codes = np.concatenate([self._is_dtype_compat(c).codes for c in to_concat]) - except TypeError: - # not all to_concat elements are among our categories (or NA) - from pandas.core.dtypes.concat import concat_compat - - res = concat_compat(to_concat) - return Index(res, name=name) - else: - cat = self._data._from_backing_data(codes) - return type(self)._simple_new(cat, name=name) + codes = np.concatenate([self._is_dtype_compat(c).codes for c in to_concat]) + cat = self._data._from_backing_data(codes) + return type(self)._simple_new(cat, name=name) def _delegate_method(self, name: str, *args, **kwargs): """ method delegation to the ._values """ diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index f0d4d36531e0d..1b18f04ba603d 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -200,7 +200,7 @@ def __contains__(self, key: Any) -> bool: @Appender(_index_shared_docs["take"] % _index_doc_kwargs) def take(self, indices, axis=0, allow_fill=True, fill_value=None, 
**kwargs): - nv.validate_take((), kwargs) + nv.validate_take(tuple(), kwargs) indices = np.asarray(indices, dtype=np.intp) maybe_slice = lib.maybe_indices_to_slice(indices, len(self)) @@ -686,19 +686,10 @@ def intersection(self, other, sort=False): """ self._validate_sort_keyword(sort) self._assert_can_do_setop(other) - other, _ = self._convert_can_do_setop(other) if self.equals(other): - if self.has_duplicates: - return self.unique()._get_reconciled_name_object(other) return self._get_reconciled_name_object(other) - return self._intersection(other, sort=sort) - - def _intersection(self, other: Index, sort=False) -> Index: - """ - intersection specialized to the case with matching dtypes. - """ if len(self) == 0: return self.copy()._get_reconciled_name_object(other) if len(other) == 0: @@ -706,14 +697,17 @@ def _intersection(self, other: Index, sort=False) -> Index: if not isinstance(other, type(self)): result = Index.intersection(self, other, sort=sort) + if isinstance(result, type(self)): + if result.freq is None: + # TODO: no tests rely on this; needed? + result = result._with_freq("infer") return result elif not self._can_fast_intersect(other): - result = Index._intersection(self, other, sort=sort) - # We need to invalidate the freq because Index._intersection + result = Index.intersection(self, other, sort=sort) + # We need to invalidate the freq because Index.intersection # uses _shallow_copy on a view of self._data, which will preserve # self.freq if we're not careful. - result = self._wrap_setop_result(other, result) return result._with_freq(None)._with_freq("infer") # to make our life easier, "sort" the two ranges diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 8329c41a74596..f6eeb121b1ac0 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -227,7 +227,6 @@ class DatetimeIndex(DatetimeTimedeltaMixin): _is_numeric_dtype = False _data: DatetimeArray - inferred_freq: Optional[str] tz: Optional[tzinfo] # -------------------------------------------------------------------- @@ -338,7 +337,7 @@ def __reduce__(self): # we use a special reduce here because we need # to simply set the .tz (and not reinterpret it) - d = {"data": self._data} + d = dict(data=self._data) d.update(self._get_attributes_dict()) return _new_DatetimeIndex, (type(self), d), None diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py index 92bd82f8263e9..3f146e273326c 100644 --- a/pandas/core/indexes/extension.py +++ b/pandas/core/indexes/extension.py @@ -273,7 +273,7 @@ def _get_engine_target(self) -> np.ndarray: return np.asarray(self._data) def repeat(self, repeats, axis=None): - nv.validate_repeat((), {"axis": axis}) + nv.validate_repeat(tuple(), dict(axis=axis)) result = self._data.repeat(repeats, axis=axis) return type(self)._simple_new(result, name=self.name) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 2f86d9c20bfe8..ed92b3dade6a0 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -11,7 +11,7 @@ from pandas._libs import lib from pandas._libs.interval import Interval, IntervalMixin, IntervalTree from pandas._libs.tslibs import BaseOffset, Timedelta, Timestamp, to_offset -from pandas._typing import AnyArrayLike, DtypeObj, Label +from pandas._typing import AnyArrayLike, Label from pandas.errors import InvalidIndexError from pandas.util._decorators import Appender, Substitution, cache_readonly from pandas.util._exceptions import 
rewrite_exception @@ -38,7 +38,6 @@ is_object_dtype, is_scalar, ) -from pandas.core.dtypes.dtypes import IntervalDtype from pandas.core.algorithms import take_1d from pandas.core.arrays.interval import IntervalArray, _interval_shared_docs @@ -51,7 +50,6 @@ default_pprint, ensure_index, maybe_extract_name, - unpack_nested_dtype, ) from pandas.core.indexes.datetimes import DatetimeIndex, date_range from pandas.core.indexes.extension import ExtensionIndex, inherit_names @@ -124,9 +122,8 @@ def setop_check(method): @wraps(method) def wrapped(self, other, sort=False): - self._validate_sort_keyword(sort) self._assert_can_do_setop(other) - other, _ = self._convert_can_do_setop(other) + other = ensure_index(other) if not isinstance(other, IntervalIndex): result = getattr(self.astype(object), op_name)(other) @@ -134,6 +131,14 @@ def wrapped(self, other, sort=False): result = result.astype(self.dtype) return result + if self._is_non_comparable_own_type(other): + # GH#19016: ensure set op will not return a prohibited dtype + raise TypeError( + "can only do set operations between two IntervalIndex " + "objects that are closed on the same side " + "and have compatible dtypes" + ) + return method(self, other, sort) return wrapped @@ -805,19 +810,6 @@ def _convert_list_indexer(self, keyarr): return locs - def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: - if not isinstance(dtype, IntervalDtype): - return False - common_subtype = find_common_type([self.dtype.subtype, dtype.subtype]) - return not is_object_dtype(common_subtype) - - def _should_compare(self, other) -> bool: - if not super()._should_compare(other): - return False - other = unpack_nested_dtype(other) - return other.closed == self.closed - - # TODO: use should_compare and get rid of _is_non_comparable_own_type def _is_non_comparable_own_type(self, other: "IntervalIndex") -> bool: # different closed or incompatible subtype -> no matches @@ -825,7 +817,8 @@ def _is_non_comparable_own_type(self, other: "IntervalIndex") -> bool: # is_comparable_dtype GH#19371 if self.closed != other.closed: return True - return not self._is_comparable_dtype(other.dtype) + common_subtype = find_common_type([self.dtype.subtype, other.dtype.subtype]) + return is_object_dtype(common_subtype) # -------------------------------------------------------------------- @@ -963,37 +956,11 @@ def _format_space(self) -> str: # -------------------------------------------------------------------- # Set Operations - def _assert_can_do_setop(self, other): - super()._assert_can_do_setop(other) - - if isinstance(other, IntervalIndex) and self._is_non_comparable_own_type(other): - # GH#19016: ensure set op will not return a prohibited dtype - raise TypeError( - "can only do set operations between two IntervalIndex " - "objects that are closed on the same side " - "and have compatible dtypes" - ) - @Appender(Index.intersection.__doc__) - def intersection(self, other, sort=False) -> Index: - self._validate_sort_keyword(sort) - self._assert_can_do_setop(other) - other, _ = self._convert_can_do_setop(other) - - if self.equals(other) and not self.has_duplicates: - return self._get_reconciled_name_object(other) - - if not isinstance(other, IntervalIndex): - return self.astype(object).intersection(other) - - result = self._intersection(other, sort=sort) - return self._wrap_setop_result(other, result) - - def _intersection(self, other, sort): - """ - intersection specialized to the case with matching dtypes. 
- """ - # For IntervalIndex we also know other.closed == self.closed + @setop_check + def intersection( + self, other: "IntervalIndex", sort: bool = False + ) -> "IntervalIndex": if self.left.is_unique and self.right.is_unique: taken = self._intersection_unique(other) elif other.left.is_unique and other.right.is_unique and self.isna().sum() <= 1: @@ -1007,7 +974,7 @@ def _intersection(self, other, sort): if sort is None: taken = taken.sort_values() - return taken + return self._wrap_setop_result(other, taken) def _intersection_unique(self, other: "IntervalIndex") -> "IntervalIndex": """ @@ -1060,10 +1027,6 @@ def _intersection_non_unique(self, other: "IntervalIndex") -> "IntervalIndex": def _setop(op_name: str, sort=None): def func(self, other, sort=sort): - # At this point we are assured - # isinstance(other, IntervalIndex) - # other.closed == self.closed - result = getattr(self._multiindex, op_name)(other._multiindex, sort=sort) result_name = get_op_result_name(self, other) @@ -1078,7 +1041,7 @@ def func(self, other, sort=sort): func.__name__ = op_name return setop_check(func) - _union = _setop("union") + union = _setop("union") difference = _setop("difference") symmetric_difference = _setop("symmetric_difference") diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index fd47c23b7c92b..9b4b459d9a122 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -20,7 +20,7 @@ from pandas._libs import algos as libalgos, index as libindex, lib from pandas._libs.hashtable import duplicated_int64 -from pandas._typing import AnyArrayLike, DtypeObj, Label, Scalar, Shape +from pandas._typing import AnyArrayLike, Label, Scalar, Shape from pandas.compat.numpy import function as nv from pandas.errors import InvalidIndexError, PerformanceWarning, UnsortedIndexError from pandas.util._decorators import Appender, cache_readonly, doc @@ -1684,6 +1684,10 @@ def unique(self, level=None): level = self._get_level_number(level) return self._get_level_values(level=level, unique=True) + def _to_safe_for_reshape(self): + """ convert to object if we are a categorical """ + return self.set_levels([i._to_safe_for_reshape() for i in self.levels]) + def to_frame(self, index=True, name=None): """ Create a DataFrame with the levels of the MultiIndex as columns. 
@@ -2165,8 +2169,7 @@ def drop(self, codes, level=None, errors="raise"): if isinstance(loc, int): inds.append(loc) elif isinstance(loc, slice): - step = loc.step if loc.step is not None else 1 - inds.extend(range(loc.start, loc.stop, step)) + inds.extend(range(loc.start, loc.stop)) elif com.is_bool_indexer(loc): if self.lexsort_depth == 0: warnings.warn( @@ -2527,10 +2530,6 @@ def _get_values_for_loc(self, series: "Series", loc, key): if is_scalar(loc): return new_values - if len(new_values) == 1 and not self.nlevels > 1: - # If more than one level left, we can not return a scalar - return new_values[0] - new_index = self[loc] new_index = maybe_droplevels(new_index, key) new_ser = series._constructor(new_values, index=new_index, name=series.name) @@ -3079,11 +3078,8 @@ def convert_indexer(start, stop, step, indexer=indexer, codes=level_codes): # given the inputs and the codes/indexer, compute an indexer set # if we have a provided indexer, then this need not consider # the entire labels set - if step is not None and step < 0: - # Switch elements for negative step size - start, stop = stop - 1, start - 1 - r = np.arange(start, stop, step) + r = np.arange(start, stop, step) if indexer is not None and len(indexer) != len(codes): # we have an indexer which maps the locations in the labels @@ -3346,8 +3342,6 @@ def _reorder_indexer( k_codes = k_codes[k_codes >= 0] # Filter absent keys # True if the given codes are not ordered need_sort = (k_codes[:-1] > k_codes[1:]).any() - elif isinstance(k, slice) and k.step is not None and k.step < 0: - need_sort = True # Bail out if both index and seq are sorted if not need_sort: return indexer @@ -3374,8 +3368,6 @@ def _reorder_indexer( key_order_map[level_indexer] = np.arange(len(level_indexer)) new_order = key_order_map[self.codes[i][indexer]] - elif isinstance(k, slice) and k.step is not None and k.step < 0: - new_order = np.arange(n)[k][indexer] elif isinstance(k, slice) and k.start is None and k.stop is None: # slice(None) should not determine order GH#31330 new_order = np.ones((n,))[indexer] @@ -3569,11 +3561,6 @@ def union(self, other, sort=None): if len(other) == 0 or self.equals(other): return self.rename(result_names) - return self._union(other, sort=sort) - - def _union(self, other, sort): - other, result_names = self._convert_can_do_setop(other) - # TODO: Index.union returns other when `len(self)` is 0. if not is_object_dtype(other.dtype): @@ -3588,9 +3575,6 @@ def _union(self, other, sort): zip(*uniq_tuples), sortorder=0, names=result_names ) - def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: - return is_object_dtype(dtype) - def intersection(self, other, sort=False): """ Form the intersection of two MultiIndex objects. 
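For reference, the MultiIndex set operations being reworked in these hunks behave like this at the user level (toy tuples, not taken from the patch):

    import pandas as pd

    left = pd.MultiIndex.from_tuples([("a", 1), ("b", 2)])
    right = pd.MultiIndex.from_tuples([("b", 2), ("c", 3)])

    left.union(right)         # ('a', 1), ('b', 2), ('c', 3)
    left.intersection(right)  # ('b', 2)
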
@@ -3617,18 +3601,17 @@ def intersection(self, other, sort=False): other, result_names = self._convert_can_do_setop(other) if self.equals(other): - if self.has_duplicates: - return self.unique().rename(result_names) - return self._get_reconciled_name_object(other) - - return self._intersection(other, sort=sort) - - def _intersection(self, other, sort=False): - other, result_names = self._convert_can_do_setop(other) + return self.rename(result_names) - if not self._is_comparable_dtype(other.dtype): + if not is_object_dtype(other.dtype): # The intersection is empty - return self[:0].rename(result_names) + # TODO: we have no tests that get here + return MultiIndex( + levels=self.levels, + codes=[[]] * self.nlevels, + names=result_names, + verify_integrity=False, + ) lvals = self._values rvals = other._values @@ -3636,12 +3619,10 @@ def _intersection(self, other, sort=False): uniq_tuples = None # flag whether _inner_indexer was successful if self.is_monotonic and other.is_monotonic: try: - inner_tuples = self._inner_indexer(lvals, rvals)[0] - sort = False # inner_tuples is already sorted + uniq_tuples = self._inner_indexer(lvals, rvals)[0] + sort = False # uniq_tuples is already sorted except TypeError: pass - else: - uniq_tuples = algos.unique(inner_tuples) if uniq_tuples is None: other_uniq = set(rvals) @@ -3732,14 +3713,16 @@ def _convert_can_do_setop(self, other): if not isinstance(other, Index): if len(other) == 0: - return self[:0], self.names + other = MultiIndex( + levels=[[]] * self.nlevels, + codes=[[]] * self.nlevels, + verify_integrity=False, + ) else: msg = "other must be a MultiIndex or a list of tuples" try: - other = MultiIndex.from_tuples(other, names=self.names) - except (ValueError, TypeError) as err: - # ValueError raised by tuples_to_object_array if we - # have non-object dtype + other = MultiIndex.from_tuples(other) + except TypeError as err: raise TypeError(msg) from err else: result_names = get_unanimous_names(self, other) diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index ed76e26a57634..12f61fc44582d 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -4,7 +4,7 @@ import numpy as np from pandas._libs import index as libindex, lib -from pandas._typing import Dtype, DtypeObj, Label +from pandas._typing import Dtype, Label from pandas.util._decorators import doc from pandas.core.dtypes.cast import astype_nansafe @@ -29,7 +29,7 @@ import pandas.core.common as com from pandas.core.indexes.base import Index, maybe_extract_name -_num_index_shared_docs = {} +_num_index_shared_docs = dict() class NumericIndex(Index): @@ -148,10 +148,6 @@ def _convert_tolerance(self, tolerance, target): ) return tolerance - def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: - # If we ever have BoolIndex or ComplexIndex, this may need to be tightened - return is_numeric_dtype(dtype) - @classmethod def _assert_safe_casting(cls, data, subarr): """ @@ -228,12 +224,7 @@ def _union(self, other, sort): An Index instance can **only** contain hashable objects. 
""" -_int64_descr_args = { - "klass": "Int64Index", - "ltype": "integer", - "dtype": "int64", - "extra": "", -} +_int64_descr_args = dict(klass="Int64Index", ltype="integer", dtype="int64", extra="") class IntegerIndex(NumericIndex): @@ -295,12 +286,9 @@ class Int64Index(IntegerIndex): _default_dtype = np.dtype(np.int64) -_uint64_descr_args = { - "klass": "UInt64Index", - "ltype": "unsigned integer", - "dtype": "uint64", - "extra": "", -} +_uint64_descr_args = dict( + klass="UInt64Index", ltype="unsigned integer", dtype="uint64", extra="" +) class UInt64Index(IntegerIndex): @@ -326,12 +314,9 @@ def _convert_arr_indexer(self, keyarr): return com.asarray_tuplesafe(keyarr, dtype=dtype) -_float64_descr_args = { - "klass": "Float64Index", - "dtype": "float64", - "ltype": "float", - "extra": "", -} +_float64_descr_args = dict( + klass="Float64Index", dtype="float64", ltype="float", extra="" +) class Float64Index(NumericIndex): diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 26bba4653007f..5dff07ee4c6dd 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -1,5 +1,5 @@ from datetime import datetime, timedelta -from typing import Any +from typing import Any, cast import warnings import numpy as np @@ -43,7 +43,7 @@ from pandas.core.ops import get_op_result_name _index_doc_kwargs = dict(ibase._index_doc_kwargs) -_index_doc_kwargs.update({"target_klass": "PeriodIndex or list of Periods"}) +_index_doc_kwargs.update(dict(target_klass="PeriodIndex or list of Periods")) # --- Period index sketch @@ -452,10 +452,13 @@ def join(self, other, how="left", level=None, return_indexers=False, sort=False) def get_indexer(self, target, method=None, limit=None, tolerance=None): target = ensure_index(target) - if not self._should_compare(target): - return self._get_indexer_non_comparable(target, method, unique=True) - if isinstance(target, PeriodIndex): + if not self._is_comparable_dtype(target.dtype): + # i.e. target.freq != self.freq + # No matches + no_matches = -1 * np.ones(self.shape, dtype=np.intp) + return no_matches + target = target._get_engine_target() # i.e. target.asi8 self_index = self._int64index else: @@ -636,19 +639,15 @@ def _setop(self, other, sort, opname: str): def intersection(self, other, sort=False): self._validate_sort_keyword(sort) self._assert_can_do_setop(other) - other, _ = self._convert_can_do_setop(other) + other = ensure_index(other) if self.equals(other): return self._get_reconciled_name_object(other) - return self._intersection(other, sort=sort) - - def _intersection(self, other, sort=False): - - if is_object_dtype(other.dtype): + elif is_object_dtype(other.dtype): return self.astype("O").intersection(other, sort=sort) - elif not self._is_comparable_dtype(other.dtype): + elif not is_dtype_equal(self.dtype, other.dtype): # We can infer that the intersection is empty. 
# assert_can_do_setop ensures that this is not just a mismatched freq this = self[:0].astype("O") @@ -660,14 +659,14 @@ def _intersection(self, other, sort=False): def difference(self, other, sort=None): self._validate_sort_keyword(sort) self._assert_can_do_setop(other) - other, result_name = self._convert_can_do_setop(other) + other = ensure_index(other) if self.equals(other): - return self[:0].rename(result_name) - - return self._difference(other, sort=sort) + # pass an empty PeriodArray with the appropriate dtype - def _difference(self, other, sort): + # TODO: overload DatetimeLikeArrayMixin.__getitem__ + values = cast(PeriodArray, self._data[:0]) + return type(self)._simple_new(values, name=self.name) if is_object_dtype(other): return self.astype(object).difference(other).astype(self.dtype) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index ec896d94a20ba..669bf115df104 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -482,11 +482,34 @@ def equals(self, other: object) -> bool: # -------------------------------------------------------------------- # Set Operations - def _intersection(self, other, sort=False): + def intersection(self, other, sort=False): + """ + Form the intersection of two Index objects. + + Parameters + ---------- + other : Index or array-like + sort : False or None, default False + Sort the resulting index if possible + + .. versionadded:: 0.24.0 + + .. versionchanged:: 0.24.1 + + Changed the default to ``False`` to match the behaviour + from before 0.24.0. + + Returns + ------- + intersection : Index + """ + self._validate_sort_keyword(sort) + + if self.equals(other): + return self._get_reconciled_name_object(other) if not isinstance(other, RangeIndex): - # Int64Index - return super()._intersection(other, sort=sort) + return super().intersection(other, sort=sort) if not len(self) or not len(other): return self._simple_new(_empty_range) @@ -528,7 +551,7 @@ def _intersection(self, other, sort=False): if sort is None: new_index = new_index.sort_values() - return new_index + return self._wrap_setop_result(other, new_index) def _min_fitting_element(self, lower_limit: int) -> int: """Returns the smallest element greater than or equal to the limit""" @@ -629,8 +652,6 @@ def _union(self, other, sort): def difference(self, other, sort=None): # optimized set operation if we have another RangeIndex self._validate_sort_keyword(sort) - self._assert_can_do_setop(other) - other, result_name = self._convert_can_do_setop(other) if not isinstance(other, RangeIndex): return super().difference(other, sort=sort) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index e7cf8cae28b88..6aa031af64833 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -259,11 +259,10 @@ def loc(self) -> "_LocIndexer": e.g. ``[True, False, True]``. - An alignable boolean Series. The index of the key will be aligned before masking. - - An alignable Index. The Index of the returned selection will be the input. - A ``callable`` function with one argument (the calling Series or DataFrame) and that returns valid output for indexing (one of the above) - See more at :ref:`Selection by Label `. 
+ See more at :ref:`Selection by Label ` Raises ------ @@ -333,14 +332,6 @@ def loc(self) -> "_LocIndexer": max_speed shield sidewinder 7 8 - Index (same behavior as ``df.reindex``) - - >>> df.loc[pd.Index(["cobra", "viper"], name="foo")] - max_speed shield - foo - cobra 1 2 - viper 4 5 - Conditional that returns a boolean Series >>> df.loc[df['shield'] > 6] @@ -672,12 +663,17 @@ def _ensure_listlike_indexer(self, key, axis=None, value=None): and not com.is_bool_indexer(key) and all(is_hashable(k) for k in key) ): - # GH#38148 - keys = self.obj.columns.union(key, sort=False) - - self.obj._mgr = self.obj._mgr.reindex_axis( - keys, axis=0, copy=False, consolidate=False, only_slice=True - ) + for i, k in enumerate(key): + if k not in self.obj: + if value is None: + self.obj[k] = np.nan + elif is_array_like(value) and value.ndim == 2: + # GH#37964 have to select columnwise in case of array + self.obj[k] = value[:, i] + elif is_list_like(value): + self.obj[k] = value[i] + else: + self.obj[k] = value def __setitem__(self, key, value): if isinstance(key, tuple): diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index fe07823a80783..74b5a184df95d 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -700,6 +700,7 @@ def convert( datetime: bool = True, numeric: bool = True, timedelta: bool = True, + coerce: bool = False, ) -> List["Block"]: """ attempt to coerce any object types to better types return a copy @@ -1261,7 +1262,6 @@ def interpolate( axis=axis, inplace=inplace, limit=limit, - limit_area=limit_area, downcast=downcast, ) # validate the interp method @@ -1288,7 +1288,6 @@ def _interpolate_with_fill( axis: int = 0, inplace: bool = False, limit: Optional[int] = None, - limit_area: Optional[str] = None, downcast: Optional[str] = None, ) -> List["Block"]: """ fillna but using the interpolate machinery """ @@ -1303,7 +1302,6 @@ def _interpolate_with_fill( method=method, axis=axis, limit=limit, - limit_area=limit_area, ) blocks = [self.make_block_same_class(values, ndim=self.ndim)] @@ -1542,7 +1540,7 @@ def _unstack(self, unstacker, fill_value, new_placement): new_values = new_values.T[mask] new_placement = new_placement[mask] - blocks = [make_block(new_values, placement=new_placement)] + blocks = [self.make_block_same_class(new_values, placement=new_placement)] return blocks, mask def quantile(self, qs, interpolation="linear", axis: int = 0): @@ -2396,28 +2394,6 @@ def quantile(self, qs, interpolation="linear", axis=0): aware = self._holder(res_blk.values.ravel(), dtype=self.dtype) return self.make_block_same_class(aware, ndim=res_blk.ndim) - def _check_ndim(self, values, ndim): - """ - ndim inference and validation. - - This is overriden by the DatetimeTZBlock to check the case of 2D - data (values.ndim == 2), which should only be allowed if ndim is - also 2. - The case of 1D array is still allowed with both ndim of 1 or 2, as - if the case for other EAs. Therefore, we are only checking - `values.ndim > ndim` instead of `values.ndim != ndim` as for - consolidated blocks. - """ - if ndim is None: - ndim = values.ndim - - if values.ndim > ndim: - raise ValueError( - "Wrong number of dimensions. 
" - f"values.ndim != ndim [{values.ndim} != {ndim}]" - ) - return ndim - class TimeDeltaBlock(DatetimeLikeBlockMixin): __slots__ = () @@ -2530,12 +2506,12 @@ def convert( datetime: bool = True, numeric: bool = True, timedelta: bool = True, + coerce: bool = False, ) -> List["Block"]: """ - attempt to cast any object types to better types return a copy of + attempt to coerce any object types to better types return a copy of the block (if copy = True) by definition we ARE an ObjectBlock!!!!! """ - # operate column-by-column def f(mask, val, idx): shape = val.shape @@ -2544,6 +2520,7 @@ def f(mask, val, idx): datetime=datetime, numeric=numeric, timedelta=timedelta, + coerce=coerce, copy=copy, ) if isinstance(values, np.ndarray): diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 93ab207d8ce12..4cd7cc56144d9 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -267,7 +267,7 @@ def __getstate__(self): "0.14.1": { "axes": axes_array, "blocks": [ - {"values": b.values, "mgr_locs": b.mgr_locs.indexer} + dict(values=b.values, mgr_locs=b.mgr_locs.indexer) for b in self.blocks ], } @@ -636,6 +636,7 @@ def convert( datetime: bool = True, numeric: bool = True, timedelta: bool = True, + coerce: bool = False, ) -> "BlockManager": return self.apply( "convert", @@ -643,6 +644,7 @@ def convert( datetime=datetime, numeric=numeric, timedelta=timedelta, + coerce=coerce, ) def replace(self, to_replace, value, inplace: bool, regex: bool) -> "BlockManager": @@ -1236,8 +1238,6 @@ def reindex_axis( limit=None, fill_value=None, copy: bool = True, - consolidate: bool = True, - only_slice: bool = False, ): """ Conform block manager to new index. @@ -1248,13 +1248,7 @@ def reindex_axis( ) return self.reindex_indexer( - new_index, - indexer, - axis=axis, - fill_value=fill_value, - copy=copy, - consolidate=consolidate, - only_slice=only_slice, + new_index, indexer, axis=axis, fill_value=fill_value, copy=copy ) def reindex_indexer( @@ -1266,7 +1260,6 @@ def reindex_indexer( allow_dups: bool = False, copy: bool = True, consolidate: bool = True, - only_slice: bool = False, ) -> T: """ Parameters @@ -1279,8 +1272,6 @@ def reindex_indexer( copy : bool, default True consolidate: bool, default True Whether to consolidate inplace before reindexing. - only_slice : bool, default False - Whether to take views, not copies, along columns. pandas-indexer with -1's only. """ @@ -1304,9 +1295,7 @@ def reindex_indexer( raise IndexError("Requested axis not found in manager") if axis == 0: - new_blocks = self._slice_take_blocks_ax0( - indexer, fill_value=fill_value, only_slice=only_slice - ) + new_blocks = self._slice_take_blocks_ax0(indexer, fill_value=fill_value) else: new_blocks = [ blk.take_nd( diff --git a/pandas/core/missing.py b/pandas/core/missing.py index e374ba435a0bd..52536583b9b0d 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -1,13 +1,13 @@ """ Routines for filling missing data. 
""" -from functools import partial + from typing import Any, List, Optional, Set, Union import numpy as np from pandas._libs import algos, lib -from pandas._typing import ArrayLike, Axis, DtypeObj +from pandas._typing import DtypeObj from pandas.compat._optional import import_optional_dependency from pandas.core.dtypes.cast import infer_dtype_from_array @@ -15,45 +15,57 @@ ensure_float64, is_integer_dtype, is_numeric_v_string_like, + is_scalar, needs_i8_conversion, ) from pandas.core.dtypes.missing import isna -def mask_missing(arr: ArrayLike, values_to_mask) -> np.ndarray: +def mask_missing(arr, values_to_mask): """ Return a masking array of same size/shape as arr with entries equaling any member of values_to_mask set to True - - Parameters - ---------- - arr : ArrayLike - values_to_mask: list, tuple, or scalar - - Returns - ------- - np.ndarray[bool] """ - # When called from Block.replace/replace_list, values_to_mask is a scalar - # known to be holdable by arr. - # When called from Series._single_replace, values_to_mask is tuple or list dtype, values_to_mask = infer_dtype_from_array(values_to_mask) - values_to_mask = np.array(values_to_mask, dtype=dtype) + + try: + values_to_mask = np.array(values_to_mask, dtype=dtype) + + except Exception: + values_to_mask = np.array(values_to_mask, dtype=object) na_mask = isna(values_to_mask) nonna = values_to_mask[~na_mask] - # GH 21977 - mask = np.zeros(arr.shape, dtype=bool) + mask = None for x in nonna: - if is_numeric_v_string_like(arr, x): - # GH#29553 prevent numpy deprecation warnings - pass + if mask is None: + if is_numeric_v_string_like(arr, x): + # GH#29553 prevent numpy deprecation warnings + mask = False + else: + mask = arr == x + + # if x is a string and arr is not, then we get False and we must + # expand the mask to size arr.shape + if is_scalar(mask): + mask = np.zeros(arr.shape, dtype=bool) else: - mask |= arr == x + if is_numeric_v_string_like(arr, x): + # GH#29553 prevent numpy deprecation warnings + mask |= False + else: + mask |= arr == x if na_mask.any(): - mask |= isna(arr) + if mask is None: + mask = isna(arr) + else: + mask |= isna(arr) + + # GH 21977 + if mask is None: + mask = np.zeros(arr.shape, dtype=bool) return mask @@ -528,92 +540,16 @@ def _cubicspline_interpolate(xi, yi, x, axis=0, bc_type="not-a-knot", extrapolat return P(x) -def _interpolate_with_limit_area( - values: ArrayLike, method: str, limit: Optional[int], limit_area: Optional[str] -) -> ArrayLike: - """ - Apply interpolation and limit_area logic to values along a to-be-specified axis. - - Parameters - ---------- - values: array-like - Input array. - method: str - Interpolation method. Could be "bfill" or "pad" - limit: int, optional - Index limit on interpolation. - limit_area: str - Limit area for interpolation. Can be "inside" or "outside" - - Returns - ------- - values: array-like - Interpolated array. 
- """ - - invalid = isna(values) - - if not invalid.all(): - first = find_valid_index(values, "first") - last = find_valid_index(values, "last") - - values = interpolate_2d( - values, - method=method, - limit=limit, - ) - - if limit_area == "inside": - invalid[first : last + 1] = False - elif limit_area == "outside": - invalid[:first] = invalid[last + 1 :] = False - - values[invalid] = np.nan - - return values - - def interpolate_2d( values, - method: str = "pad", - axis: Axis = 0, - limit: Optional[int] = None, - limit_area: Optional[str] = None, + method="pad", + axis=0, + limit=None, ): """ Perform an actual interpolation of values, values will be make 2-d if needed fills inplace, returns the result. - - Parameters - ---------- - values: array-like - Input array. - method: str, default "pad" - Interpolation method. Could be "bfill" or "pad" - axis: 0 or 1 - Interpolation axis - limit: int, optional - Index limit on interpolation. - limit_area: str, optional - Limit area for interpolation. Can be "inside" or "outside" - - Returns - ------- - values: array-like - Interpolated array. """ - if limit_area is not None: - return np.apply_along_axis( - partial( - _interpolate_with_limit_area, - method=method, - limit=limit, - limit_area=limit_area, - ), - axis, - values, - ) - orig_values = values transf = (lambda x: x) if axis == 0 else (lambda x: x.T) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 88662a4fabed8..80c4cd5b44a92 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -12,6 +12,7 @@ from pandas._typing import ArrayLike, Dtype, DtypeObj, F, Scalar from pandas.compat._optional import import_optional_dependency +from pandas.core.dtypes.cast import maybe_upcast_putmask from pandas.core.dtypes.common import ( get_dtype, is_any_int_dtype, @@ -283,7 +284,7 @@ def _get_values( """ # In _get_values is only called from within nanops, and in all cases # with scalar fill_value. This guarantee is important for the - # np.where call below + # maybe_upcast_putmask call below assert is_scalar(fill_value) values = extract_array(values, extract_numpy=True) @@ -291,12 +292,10 @@ def _get_values( dtype = values.dtype - datetimelike = False if needs_i8_conversion(values.dtype): # changing timedelta64/datetime64 to int64 needs to happen after # finding `mask` above values = np.asarray(values.view("i8")) - datetimelike = True dtype_ok = _na_ok_dtype(dtype) @@ -307,13 +306,13 @@ def _get_values( ) if skipna and (mask is not None) and (fill_value is not None): - if mask.any(): - if dtype_ok or datetimelike: - values = values.copy() - np.putmask(values, mask, fill_value) - else: - # np.where will promote if needed - values = np.where(~mask, values, fill_value) + values = values.copy() + if dtype_ok and mask.any(): + np.putmask(values, mask, fill_value) + + # promote if needed + else: + values, _ = maybe_upcast_putmask(values, mask, fill_value) # return a platform independent precision dtype dtype_max = dtype diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index d8b5dba424cbf..2b159c607b0a0 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -311,10 +311,7 @@ def should_reindex_frame_op( # TODO: any other cases we should handle here? 
cols = left.columns.intersection(right.columns) - # Intersection is always unique so we have to check the unique columns - left_uniques = left.columns.unique() - right_uniques = right.columns.unique() - if len(cols) and not (cols.equals(left_uniques) and cols.equals(right_uniques)): + if len(cols) and not (cols.equals(left.columns) and cols.equals(right.columns)): # TODO: is there a shortcut available when len(cols) == 0? return True diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 41d539564d91e..c855687552e82 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -30,7 +30,6 @@ from pandas.core.dtypes.generic import ABCExtensionArray, ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna, notna -from pandas.core.construction import ensure_wrapped_if_datetimelike from pandas.core.ops import missing from pandas.core.ops.dispatch import should_extension_dispatch from pandas.core.ops.invalid import invalid_comparison @@ -176,8 +175,8 @@ def arithmetic_op(left: ArrayLike, right: Any, op): # NB: We assume that extract_array has already been called # on `left` and `right`. - lvalues = ensure_wrapped_if_datetimelike(left) - rvalues = ensure_wrapped_if_datetimelike(right) + lvalues = maybe_upcast_datetimelike_array(left) + rvalues = maybe_upcast_datetimelike_array(right) rvalues = _maybe_upcast_for_op(rvalues, lvalues.shape) if should_extension_dispatch(lvalues, rvalues) or isinstance(rvalues, Timedelta): @@ -207,7 +206,7 @@ def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike: ndarray or ExtensionArray """ # NB: We assume extract_array has already been called on left and right - lvalues = ensure_wrapped_if_datetimelike(left) + lvalues = maybe_upcast_datetimelike_array(left) rvalues = right rvalues = lib.item_from_zerodim(rvalues) @@ -332,7 +331,7 @@ def fill_bool(x, left=None): right = construct_1d_object_array_from_listlike(right) # NB: We assume extract_array has already been called on left and right - lvalues = ensure_wrapped_if_datetimelike(left) + lvalues = maybe_upcast_datetimelike_array(left) rvalues = right if should_extension_dispatch(lvalues, rvalues): @@ -401,6 +400,31 @@ def get_array_op(op): raise NotImplementedError(op_name) +def maybe_upcast_datetimelike_array(obj: ArrayLike) -> ArrayLike: + """ + If we have an ndarray that is either datetime64 or timedelta64, wrap in EA. 
+ + Parameters + ---------- + obj : ndarray or ExtensionArray + + Returns + ------- + ndarray or ExtensionArray + """ + if isinstance(obj, np.ndarray): + if obj.dtype.kind == "m": + from pandas.core.arrays import TimedeltaArray + + return TimedeltaArray._from_sequence(obj) + if obj.dtype.kind == "M": + from pandas.core.arrays import DatetimeArray + + return DatetimeArray._from_sequence(obj) + + return obj + + def _maybe_upcast_for_op(obj, shape: Shape): """ Cast non-pandas objects to pandas types to unify behavior of arithmetic diff --git a/pandas/core/ops/methods.py b/pandas/core/ops/methods.py index 4866905d32b83..96a691da38b99 100644 --- a/pandas/core/ops/methods.py +++ b/pandas/core/ops/methods.py @@ -62,11 +62,11 @@ def add_flex_arithmetic_methods(cls): flex_arith_method, flex_comp_method = _get_method_wrappers(cls) new_methods = _create_methods(cls, flex_arith_method, flex_comp_method) new_methods.update( - { - "multiply": new_methods["mul"], - "subtract": new_methods["sub"], - "divide": new_methods["div"], - } + dict( + multiply=new_methods["mul"], + subtract=new_methods["sub"], + divide=new_methods["div"], + ) ) # opt out of bool flex methods for now assert not any(kname in new_methods for kname in ("ror_", "rxor", "rand_")) @@ -84,22 +84,22 @@ def _create_methods(cls, arith_method, comp_method): new_methods = {} new_methods.update( - { - "add": arith_method(operator.add), - "radd": arith_method(radd), - "sub": arith_method(operator.sub), - "mul": arith_method(operator.mul), - "truediv": arith_method(operator.truediv), - "floordiv": arith_method(operator.floordiv), - "mod": arith_method(operator.mod), - "pow": arith_method(operator.pow), - "rmul": arith_method(rmul), - "rsub": arith_method(rsub), - "rtruediv": arith_method(rtruediv), - "rfloordiv": arith_method(rfloordiv), - "rpow": arith_method(rpow), - "rmod": arith_method(rmod), - } + dict( + add=arith_method(operator.add), + radd=arith_method(radd), + sub=arith_method(operator.sub), + mul=arith_method(operator.mul), + truediv=arith_method(operator.truediv), + floordiv=arith_method(operator.floordiv), + mod=arith_method(operator.mod), + pow=arith_method(operator.pow), + rmul=arith_method(rmul), + rsub=arith_method(rsub), + rtruediv=arith_method(rtruediv), + rfloordiv=arith_method(rfloordiv), + rpow=arith_method(rpow), + rmod=arith_method(rmod), + ) ) new_methods["div"] = new_methods["truediv"] new_methods["rdiv"] = new_methods["rtruediv"] @@ -109,14 +109,14 @@ def _create_methods(cls, arith_method, comp_method): new_methods["rdivmod"] = arith_method(rdivmod) new_methods.update( - { - "eq": comp_method(operator.eq), - "ne": comp_method(operator.ne), - "lt": comp_method(operator.lt), - "gt": comp_method(operator.gt), - "le": comp_method(operator.le), - "ge": comp_method(operator.ge), - } + dict( + eq=comp_method(operator.eq), + ne=comp_method(operator.ne), + lt=comp_method(operator.lt), + gt=comp_method(operator.gt), + le=comp_method(operator.le), + ge=comp_method(operator.ge), + ) ) new_methods = {k.strip("_"): v for k, v in new_methods.items()} diff --git a/pandas/core/resample.py b/pandas/core/resample.py index f0b1228a5340c..a2f25bbcf38d3 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -43,7 +43,7 @@ from pandas.tseries.frequencies import is_subperiod, is_superperiod from pandas.tseries.offsets import DateOffset, Day, Nano, Tick -_shared_docs_kwargs: Dict[str, str] = {} +_shared_docs_kwargs: Dict[str, str] = dict() class Resampler(BaseGroupBy, ShallowMixin): diff --git a/pandas/core/reshape/melt.py 
b/pandas/core/reshape/melt.py index f49aaee8bbc00..bcdb223415813 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -22,7 +22,7 @@ from pandas import DataFrame, Series -@Appender(_shared_docs["melt"] % {"caller": "pd.melt(df, ", "other": "DataFrame.melt"}) +@Appender(_shared_docs["melt"] % dict(caller="pd.melt(df, ", other="DataFrame.melt")) def melt( frame: "DataFrame", id_vars=None, @@ -42,7 +42,7 @@ def melt( if value_name in frame.columns: warnings.warn( "This dataframe has a column name that matches the 'value_name' column " - "name of the resulting Dataframe. " + "name of the resultiing Dataframe. " "In the future this will raise an error, please set the 'value_name' " "parameter of DataFrame.melt to a unique name.", FutureWarning, diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 2c6cdb846221f..3b755c40721fb 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -114,8 +114,11 @@ def _groupby_and_merge(by, on, left: "DataFrame", right: "DataFrame", merge_piec # if we can groupby the rhs # then we can get vastly better perf - if all(item in right.columns for item in by): + + try: rby = right.groupby(by, sort=False) + except KeyError: + pass for key, lhs in lby: @@ -137,7 +140,9 @@ def _groupby_and_merge(by, on, left: "DataFrame", right: "DataFrame", merge_piec # make sure join keys are in the merged # TODO, should merge_pieces do this? - merged[by] = key + for k in by: + if k in merged: + merged[k] = key pieces.append(merged) @@ -271,20 +276,10 @@ def _merger(x, y): if left_by is not None and right_by is not None: raise ValueError("Can only group either left or right frames") elif left_by is not None: - if isinstance(left_by, str): - left_by = [left_by] - check = set(left_by).difference(left.columns) - if len(check) != 0: - raise KeyError(f"{check} not found in left columns") result, _ = _groupby_and_merge( left_by, on, left, right, lambda x, y: _merger(x, y) ) elif right_by is not None: - if isinstance(right_by, str): - right_by = [right_by] - check = set(right_by).difference(right.columns) - if len(check) != 0: - raise KeyError(f"{check} not found in right columns") result, _ = _groupby_and_merge( right_by, on, right, left, lambda x, y: _merger(y, x) ) @@ -1276,9 +1271,7 @@ def _validate_specification(self): raise MergeError("Must pass left_on or left_index=True") else: # use the common columns - left_cols = self.left.columns - right_cols = self.right.columns - common_cols = left_cols.intersection(right_cols) + common_cols = self.left.columns.intersection(self.right.columns) if len(common_cols) == 0: raise MergeError( "No common columns to perform merge on. 
" @@ -1287,10 +1280,7 @@ def _validate_specification(self): f"left_index={self.left_index}, " f"right_index={self.right_index}" ) - if ( - not left_cols.join(common_cols, how="inner").is_unique - or not right_cols.join(common_cols, how="inner").is_unique - ): + if not common_cols.is_unique: raise MergeError(f"Data columns not unique: {repr(common_cols)}") self.left_on = self.right_on = common_cols elif self.on is not None: diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 40496a5b8671b..c1198cdfcda81 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -5,7 +5,6 @@ List, Optional, Sequence, - Set, Tuple, Union, cast, @@ -268,13 +267,19 @@ def _add_margins( margin_dummy = DataFrame(row_margin, columns=[key]).T row_names = result.index.names - # check the result column and leave floats - for dtype in set(result.dtypes): - cols = result.select_dtypes([dtype]).columns - margin_dummy[cols] = margin_dummy[cols].apply( - maybe_downcast_to_dtype, args=(dtype,) - ) - result = result.append(margin_dummy) + try: + # check the result column and leave floats + for dtype in set(result.dtypes): + cols = result.select_dtypes([dtype]).columns + margin_dummy[cols] = margin_dummy[cols].apply( + maybe_downcast_to_dtype, args=(dtype,) + ) + result = result.append(margin_dummy) + except TypeError: + + # we cannot reshape, so coerce the axis + result.index = result.index._to_safe_for_reshape() + result = result.append(margin_dummy) result.index.names = row_names return result @@ -322,7 +327,16 @@ def _all_key(key): # we are going to mutate this, so need to copy! piece = piece.copy() - piece[all_key] = margin[key] + try: + piece[all_key] = margin[key] + except ValueError: + # we cannot reshape, so coerce the axis + piece.set_axis( + piece._get_axis(cat_axis)._to_safe_for_reshape(), + axis=cat_axis, + inplace=True, + ) + piece[all_key] = margin[key] table_pieces.append(piece) margin_keys.append(all_key) @@ -564,37 +578,29 @@ def crosstab( b 0 1 0 c 0 0 0 """ - if values is None and aggfunc is not None: - raise ValueError("aggfunc cannot be used without values.") - - if values is not None and aggfunc is None: - raise ValueError("values cannot be used without an aggfunc.") - index = com.maybe_make_list(index) columns = com.maybe_make_list(columns) + rownames = _get_names(index, rownames, prefix="row") + colnames = _get_names(columns, colnames, prefix="col") + common_idx = None pass_objs = [x for x in index + columns if isinstance(x, (ABCSeries, ABCDataFrame))] if pass_objs: common_idx = get_objs_combined_axis(pass_objs, intersect=True, sort=False) - rownames = _get_names(index, rownames, prefix="row") - colnames = _get_names(columns, colnames, prefix="col") + data: Dict = {} + data.update(zip(rownames, index)) + data.update(zip(colnames, columns)) - # duplicate names mapped to unique names for pivot op - ( - rownames_mapper, - unique_rownames, - colnames_mapper, - unique_colnames, - ) = _build_names_mapper(rownames, colnames) + if values is None and aggfunc is not None: + raise ValueError("aggfunc cannot be used without values.") + + if values is not None and aggfunc is None: + raise ValueError("values cannot be used without an aggfunc.") from pandas import DataFrame - data = { - **dict(zip(unique_rownames, index)), - **dict(zip(unique_colnames, columns)), - } df = DataFrame(data, index=common_idx) original_df_cols = df.columns @@ -607,8 +613,8 @@ def crosstab( table = df.pivot_table( ["__dummy__"], - index=unique_rownames, - columns=unique_colnames, + 
index=rownames, + columns=colnames, margins=margins, margins_name=margins_name, dropna=dropna, @@ -627,9 +633,6 @@ def crosstab( table, normalize=normalize, margins=margins, margins_name=margins_name ) - table = table.rename_axis(index=rownames_mapper, axis=0) - table = table.rename_axis(columns=colnames_mapper, axis=1) - return table @@ -728,57 +731,3 @@ def _get_names(arrs, names, prefix: str = "row"): names = list(names) return names - - -def _build_names_mapper( - rownames: List[str], colnames: List[str] -) -> Tuple[Dict[str, str], List[str], Dict[str, str], List[str]]: - """ - Given the names of a DataFrame's rows and columns, returns a set of unique row - and column names and mappers that convert to original names. - - A row or column name is replaced if it is duplicate among the rows of the inputs, - among the columns of the inputs or between the rows and the columns. - - Paramters - --------- - rownames: list[str] - colnames: list[str] - - Returns - ------- - Tuple(Dict[str, str], List[str], Dict[str, str], List[str]) - - rownames_mapper: dict[str, str] - a dictionary with new row names as keys and original rownames as values - unique_rownames: list[str] - a list of rownames with duplicate names replaced by dummy names - colnames_mapper: dict[str, str] - a dictionary with new column names as keys and original column names as values - unique_colnames: list[str] - a list of column names with duplicate names replaced by dummy names - - """ - - def get_duplicates(names): - seen: Set = set() - return {name for name in names if name not in seen} - - shared_names = set(rownames).intersection(set(colnames)) - dup_names = get_duplicates(rownames) | get_duplicates(colnames) | shared_names - - rownames_mapper = { - f"row_{i}": name for i, name in enumerate(rownames) if name in dup_names - } - unique_rownames = [ - f"row_{i}" if name in dup_names else name for i, name in enumerate(rownames) - ] - - colnames_mapper = { - f"col_{i}": name for i, name in enumerate(colnames) if name in dup_names - } - unique_colnames = [ - f"col_{i}" if name in dup_names else name for i, name in enumerate(colnames) - ] - - return rownames_mapper, unique_rownames, colnames_mapper, unique_colnames diff --git a/pandas/core/series.py b/pandas/core/series.py index b20cf8eed9a2e..d493ac0a8c051 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -84,13 +84,7 @@ from pandas.core.generic import NDFrame from pandas.core.indexers import deprecate_ndim_indexing, unpack_1tuple from pandas.core.indexes.accessors import CombinedDatetimelikeProperties -from pandas.core.indexes.api import ( - CategoricalIndex, - Float64Index, - Index, - MultiIndex, - ensure_index, -) +from pandas.core.indexes.api import Float64Index, Index, MultiIndex, ensure_index import pandas.core.indexes.base as ibase from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.period import PeriodIndex @@ -111,21 +105,21 @@ __all__ = ["Series"] -_shared_doc_kwargs = { - "axes": "index", - "klass": "Series", - "axes_single_arg": "{0 or 'index'}", - "axis": """axis : {0 or 'index'} +_shared_doc_kwargs = dict( + axes="index", + klass="Series", + axes_single_arg="{0 or 'index'}", + axis="""axis : {0 or 'index'} Parameter needed for compatibility with DataFrame.""", - "inplace": """inplace : boolean, default False + inplace="""inplace : boolean, default False If True, performs operation inplace and returns None.""", - "unique": "np.ndarray", - "duplicated": "Series", - "optional_by": "", - "optional_mapper": "", - 
"optional_labels": "", - "optional_axis": "", -} + unique="np.ndarray", + duplicated="Series", + optional_by="", + optional_mapper="", + optional_labels="", + optional_axis="", +) def _coerce_method(converter): @@ -418,13 +412,7 @@ def _set_axis(self, axis: int, labels, fastpath: bool = False) -> None: labels = ensure_index(labels) if labels._is_all_dates: - deep_labels = labels - if isinstance(labels, CategoricalIndex): - deep_labels = labels.categories - - if not isinstance( - deep_labels, (DatetimeIndex, PeriodIndex, TimedeltaIndex) - ): + if not isinstance(labels, (DatetimeIndex, PeriodIndex, TimedeltaIndex)): try: labels = DatetimeIndex(labels) # need to set here because we changed the index @@ -916,8 +904,7 @@ def _get_values(self, indexer): except ValueError: # mpl compat if we look up e.g. ser[:, np.newaxis]; # see tests.series.timeseries.test_mpl_compat_hack - # the asarray is needed to avoid returning a 2D DatetimeArray - return np.asarray(self._values[indexer]) + return self._values[indexer] def _get_value(self, label, takeable: bool = False): """ @@ -1114,7 +1101,7 @@ def repeat(self, repeats, axis=None) -> "Series": 2 c dtype: object """ - nv.validate_repeat(tuple(), {"axis": axis}) + nv.validate_repeat(tuple(), dict(axis=axis)) new_index = self.index.repeat(repeats) new_values = self._values.repeat(repeats) return self._constructor(new_values, index=new_index).__finalize__( @@ -4719,7 +4706,6 @@ def _convert_dtypes( convert_string: bool = True, convert_integer: bool = True, convert_boolean: bool = True, - convert_floating: bool = True, ) -> "Series": input_series = self if infer_objects: @@ -4727,13 +4713,9 @@ def _convert_dtypes( if is_object_dtype(input_series): input_series = input_series.copy() - if convert_string or convert_integer or convert_boolean or convert_floating: + if convert_string or convert_integer or convert_boolean: inferred_dtype = convert_dtypes( - input_series._values, - convert_string, - convert_integer, - convert_boolean, - convert_floating, + input_series._values, convert_string, convert_integer, convert_boolean ) try: result = input_series.astype(inferred_dtype) diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index 3aeb3b664b27f..9de9d1f434a12 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -1,6 +1,6 @@ from typing import Dict -_shared_docs: Dict[str, str] = {} +_shared_docs: Dict[str, str] = dict() _shared_docs[ "aggregate" diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 0a1cbc6de1cda..729f517c789a7 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -31,6 +31,7 @@ if TYPE_CHECKING: from pandas import MultiIndex + from pandas.core.arrays import ExtensionArray from pandas.core.indexes.base import Index _INT64_MAX = np.iinfo(np.int64).max @@ -390,7 +391,7 @@ def nargsort( return indexer -def nargminmax(values, method: str): +def nargminmax(values: "ExtensionArray", method: str) -> int: """ Implementation of np.argmin/argmax but for ExtensionArray and which handles missing values. 
@@ -405,16 +406,20 @@ def nargminmax(values, method: str): int """ assert method in {"argmax", "argmin"} - func = np.argmax if method == "argmax" else np.argmin - mask = np.asarray(isna(values)) - values = values._values_for_argsort() + mask = np.asarray(values.isna()) + if mask.all(): + # Use same exception message we would get from numpy + raise ValueError(f"attempt to get {method} of an empty sequence") - idx = np.arange(len(values)) - non_nans = values[~mask] - non_nan_idx = idx[~mask] + if method == "argmax": + # Use argsort with ascending=False so that if more than one entry + # achieves the maximum, we take the first such occurence. + sorters = values.argsort(ascending=False) + else: + sorters = values.argsort(ascending=True) - return non_nan_idx[func(non_nans)] + return sorters[0] def _ensure_key_mapped_multiindex( diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 2713b76189157..9d16beba669ca 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -26,7 +26,7 @@ from pandas.core.base import NoNewAttributesMixin -_shared_docs: Dict[str, str] = {} +_shared_docs: Dict[str, str] = dict() _cpython_optimized_encoders = ( "utf-8", "utf8", @@ -1446,17 +1446,17 @@ def pad(self, width, side="left", fillchar=" "): filled : Series/Index of objects. """ - @Appender(_shared_docs["str_pad"] % {"side": "left and right", "method": "center"}) + @Appender(_shared_docs["str_pad"] % dict(side="left and right", method="center")) @forbid_nonstring_types(["bytes"]) def center(self, width, fillchar=" "): return self.pad(width, side="both", fillchar=fillchar) - @Appender(_shared_docs["str_pad"] % {"side": "right", "method": "ljust"}) + @Appender(_shared_docs["str_pad"] % dict(side="right", method="ljust")) @forbid_nonstring_types(["bytes"]) def ljust(self, width, fillchar=" "): return self.pad(width, side="right", fillchar=fillchar) - @Appender(_shared_docs["str_pad"] % {"side": "left", "method": "rjust"}) + @Appender(_shared_docs["str_pad"] % dict(side="left", method="rjust")) @forbid_nonstring_types(["bytes"]) def rjust(self, width, fillchar=" "): return self.pad(width, side="left", fillchar=fillchar) @@ -1790,11 +1790,9 @@ def encode(self, encoding, errors="strict"): @Appender( _shared_docs["str_strip"] - % { - "side": "left and right sides", - "method": "strip", - "position": "leading and trailing", - } + % dict( + side="left and right sides", method="strip", position="leading and trailing" + ) ) @forbid_nonstring_types(["bytes"]) def strip(self, to_strip=None): @@ -1803,7 +1801,7 @@ def strip(self, to_strip=None): @Appender( _shared_docs["str_strip"] - % {"side": "left side", "method": "lstrip", "position": "leading"} + % dict(side="left side", method="lstrip", position="leading") ) @forbid_nonstring_types(["bytes"]) def lstrip(self, to_strip=None): @@ -1812,7 +1810,7 @@ def lstrip(self, to_strip=None): @Appender( _shared_docs["str_strip"] - % {"side": "right side", "method": "rstrip", "position": "trailing"} + % dict(side="right side", method="rstrip", position="trailing") ) @forbid_nonstring_types(["bytes"]) def rstrip(self, to_strip=None): @@ -2414,11 +2412,11 @@ def extractall(self, pat, flags=0): @Appender( _shared_docs["find"] - % { - "side": "lowest", - "method": "find", - "also": "rfind : Return highest indexes in each strings.", - } + % dict( + side="lowest", + method="find", + also="rfind : Return highest indexes in each strings.", + ) ) @forbid_nonstring_types(["bytes"]) def find(self, sub, start=0, end=None): @@ -2431,11 
+2429,11 @@ def find(self, sub, start=0, end=None): @Appender( _shared_docs["find"] - % { - "side": "highest", - "method": "rfind", - "also": "find : Return lowest indexes in each strings.", - } + % dict( + side="highest", + method="rfind", + also="find : Return lowest indexes in each strings.", + ) ) @forbid_nonstring_types(["bytes"]) def rfind(self, sub, start=0, end=None): @@ -2497,12 +2495,12 @@ def normalize(self, form): @Appender( _shared_docs["index"] - % { - "side": "lowest", - "similar": "find", - "method": "index", - "also": "rindex : Return highest indexes in each strings.", - } + % dict( + side="lowest", + similar="find", + method="index", + also="rindex : Return highest indexes in each strings.", + ) ) @forbid_nonstring_types(["bytes"]) def index(self, sub, start=0, end=None): @@ -2515,12 +2513,12 @@ def index(self, sub, start=0, end=None): @Appender( _shared_docs["index"] - % { - "side": "highest", - "similar": "rfind", - "method": "rindex", - "also": "index : Return lowest indexes in each strings.", - } + % dict( + side="highest", + similar="rfind", + method="rindex", + also="index : Return lowest indexes in each strings.", + ) ) @forbid_nonstring_types(["bytes"]) def rindex(self, sub, start=0, end=None): @@ -2655,24 +2653,18 @@ def len(self): # isalpha, isnumeric isalnum isdigit isdecimal isspace islower isupper istitle # _doc_args holds dict of strings to use in substituting casemethod docs _doc_args: Dict[str, Dict[str, str]] = {} - _doc_args["lower"] = {"type": "lowercase", "method": "lower", "version": ""} - _doc_args["upper"] = {"type": "uppercase", "method": "upper", "version": ""} - _doc_args["title"] = {"type": "titlecase", "method": "title", "version": ""} - _doc_args["capitalize"] = { - "type": "be capitalized", - "method": "capitalize", - "version": "", - } - _doc_args["swapcase"] = { - "type": "be swapcased", - "method": "swapcase", - "version": "", - } - _doc_args["casefold"] = { - "type": "be casefolded", - "method": "casefold", - "version": "\n .. versionadded:: 0.25.0\n", - } + _doc_args["lower"] = dict(type="lowercase", method="lower", version="") + _doc_args["upper"] = dict(type="uppercase", method="upper", version="") + _doc_args["title"] = dict(type="titlecase", method="title", version="") + _doc_args["capitalize"] = dict( + type="be capitalized", method="capitalize", version="" + ) + _doc_args["swapcase"] = dict(type="be swapcased", method="swapcase", version="") + _doc_args["casefold"] = dict( + type="be casefolded", + method="casefold", + version="\n .. 
versionadded:: 0.25.0\n", + ) @Appender(_shared_docs["casemethods"] % _doc_args["lower"]) @forbid_nonstring_types(["bytes"]) @@ -2852,15 +2844,15 @@ def casefold(self): 3 False dtype: bool """ - _doc_args["isalnum"] = {"type": "alphanumeric", "method": "isalnum"} - _doc_args["isalpha"] = {"type": "alphabetic", "method": "isalpha"} - _doc_args["isdigit"] = {"type": "digits", "method": "isdigit"} - _doc_args["isspace"] = {"type": "whitespace", "method": "isspace"} - _doc_args["islower"] = {"type": "lowercase", "method": "islower"} - _doc_args["isupper"] = {"type": "uppercase", "method": "isupper"} - _doc_args["istitle"] = {"type": "titlecase", "method": "istitle"} - _doc_args["isnumeric"] = {"type": "numeric", "method": "isnumeric"} - _doc_args["isdecimal"] = {"type": "decimal", "method": "isdecimal"} + _doc_args["isalnum"] = dict(type="alphanumeric", method="isalnum") + _doc_args["isalpha"] = dict(type="alphabetic", method="isalpha") + _doc_args["isdigit"] = dict(type="digits", method="isdigit") + _doc_args["isspace"] = dict(type="whitespace", method="isspace") + _doc_args["islower"] = dict(type="lowercase", method="islower") + _doc_args["isupper"] = dict(type="uppercase", method="isupper") + _doc_args["istitle"] = dict(type="titlecase", method="istitle") + _doc_args["isnumeric"] = dict(type="numeric", method="isnumeric") + _doc_args["isdecimal"] = dict(type="decimal", method="isdecimal") # force _noarg_wrapper return type with dtype=np.dtype(bool) (GH 29624) isalnum = _map_and_wrap( diff --git a/pandas/core/util/numba_.py b/pandas/core/util/numba_.py index ed920c174ea69..1dd005c1602a5 100644 --- a/pandas/core/util/numba_.py +++ b/pandas/core/util/numba_.py @@ -9,7 +9,7 @@ from pandas.errors import NumbaUtilError GLOBAL_USE_NUMBA: bool = False -NUMBA_FUNC_CACHE: Dict[Tuple[Callable, str], Callable] = {} +NUMBA_FUNC_CACHE: Dict[Tuple[Callable, str], Callable] = dict() def maybe_use_numba(engine: Optional[str]) -> bool: diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index e6185f8ae0679..51a1e2102c273 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -50,6 +50,7 @@ from pandas.core.aggregation import aggregate from pandas.core.base import DataError, SelectionMixin +import pandas.core.common as com from pandas.core.construction import extract_array from pandas.core.groupby.base import GotItemMixin, ShallowMixin from pandas.core.indexes.api import Index, MultiIndex @@ -790,29 +791,22 @@ def _apply( # Our result will have still kept the column in the result result = result.drop(columns=column_keys, errors="ignore") - codes = self._groupby.grouper.codes - levels = self._groupby.grouper.levels - - group_indices = self._groupby.grouper.indices.values() - if group_indices: - indexer = np.concatenate(list(group_indices)) - else: - indexer = np.array([], dtype=np.intp) - codes = [c.take(indexer) for c in codes] - - # if the index of the original dataframe needs to be preserved, append - # this index (but reordered) to the codes/levels from the groupby - if grouped_object_index is not None: - idx = grouped_object_index.take(indexer) - if not isinstance(idx, MultiIndex): - idx = MultiIndex.from_arrays([idx]) - codes.extend(list(idx.codes)) - levels.extend(list(idx.levels)) - - result_index = MultiIndex( - levels, codes, names=result_index_names, verify_integrity=False + result_index_data = [] + for key, values in self._groupby.grouper.indices.items(): + for value in values: + data = [ + *com.maybe_make_list(key), + *com.maybe_make_list( + 
grouped_object_index[value] + if grouped_object_index is not None + else [] + ), + ] + result_index_data.append(tuple(data)) + + result_index = MultiIndex.from_tuples( + result_index_data, names=result_index_names ) - result.index = result_index return result diff --git a/pandas/io/common.py b/pandas/io/common.py index 9fede5180e727..8ec0a869c7042 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -7,6 +7,7 @@ from io import BufferedIOBase, BytesIO, RawIOBase, TextIOWrapper import mmap import os +import pathlib from typing import IO, Any, AnyStr, Dict, List, Mapping, Optional, Tuple, cast from urllib.parse import ( urljoin, @@ -175,8 +176,19 @@ def stringify_path( Any other object is passed through unchanged, which includes bytes, strings, buffers, or anything else that's not even path-like. """ - if isinstance(filepath_or_buffer, os.PathLike): - filepath_or_buffer = filepath_or_buffer.__fspath__() + if hasattr(filepath_or_buffer, "__fspath__"): + # https://github.com/python/mypy/issues/1424 + # error: Item "str" of "Union[str, Path, IO[str]]" has no attribute + # "__fspath__" [union-attr] + # error: Item "IO[str]" of "Union[str, Path, IO[str]]" has no attribute + # "__fspath__" [union-attr] + # error: Item "str" of "Union[str, Path, IO[bytes]]" has no attribute + # "__fspath__" [union-attr] + # error: Item "IO[bytes]" of "Union[str, Path, IO[bytes]]" has no + # attribute "__fspath__" [union-attr] + filepath_or_buffer = filepath_or_buffer.__fspath__() # type: ignore[union-attr] + elif isinstance(filepath_or_buffer, pathlib.Path): + filepath_or_buffer = str(filepath_or_buffer) return _expand_user(filepath_or_buffer) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 626c3df196380..c519baa4c21da 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -1,17 +1,14 @@ import abc import datetime -import inspect from io import BufferedIOBase, BytesIO, RawIOBase import os from textwrap import fill from typing import Any, Dict, Mapping, Union, cast -import warnings from pandas._config import config from pandas._libs.parsers import STR_NA_VALUES from pandas._typing import Buffer, FilePathOrBuffer, StorageOptions -from pandas.compat._optional import import_optional_dependency from pandas.errors import EmptyDataError from pandas.util._decorators import Appender, deprecate_nonkeyword_arguments @@ -102,32 +99,12 @@ of dtype conversion. engine : str, default None If io is not a buffer or path, this must be set to identify io. - Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb". + Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb", default "xlrd". Engine compatibility : - - "xlrd" supports most old/new Excel file formats. - "openpyxl" supports newer Excel file formats. - "odf" supports OpenDocument file formats (.odf, .ods, .odt). - "pyxlsb" supports Binary Excel files. - - .. versionchanged:: 1.2.0 - The engine `xlrd `_ - is no longer maintained, and is not supported with - python >= 3.9. When ``engine=None``, the following logic will be - used to determine the engine. - - - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), - then `odf `_ will be used. - - Otherwise if ``path_or_buffer`` is a bytes stream, the file has the - extension ``.xls``, or is an ``xlrd`` Book instance, then ``xlrd`` will - be used. - - Otherwise if `openpyxl `_ is installed, - then ``openpyxl`` will be used. - - Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised. 
- - Specifying ``engine="xlrd"`` will continue to be allowed for the - indefinite future. - converters : dict, default None Dict of functions for converting values in certain columns. Keys can either be integers or column labels, values are functions that take one @@ -576,9 +553,6 @@ class ExcelWriter(metaclass=abc.ABCMeta): Default is to use xlwt for xls, openpyxl for xlsx, odf for ods. See DataFrame.to_excel for typical usage. - The writer should be used as a context manager. Otherwise, call `close()` to save - and close any opened file handles. - Parameters ---------- path : str or typing.BinaryIO @@ -903,32 +877,13 @@ class ExcelFile: .xls, .xlsx, .xlsb, .xlsm, .odf, .ods, or .odt file. engine : str, default None If io is not a buffer or path, this must be set to identify io. - Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb`` + Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb``, + default ``xlrd``. Engine compatibility : - - ``xlrd`` supports most old/new Excel file formats. - ``openpyxl`` supports newer Excel file formats. - ``odf`` supports OpenDocument file formats (.odf, .ods, .odt). - ``pyxlsb`` supports Binary Excel files. - - .. versionchanged:: 1.2.0 - - The engine `xlrd `_ - is no longer maintained, and is not supported with - python >= 3.9. When ``engine=None``, the following logic will be - used to determine the engine. - - - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), - then `odf `_ will be used. - - Otherwise if ``path_or_buffer`` is a bytes stream, the file has the - extension ``.xls``, or is an ``xlrd`` Book instance, then ``xlrd`` - will be used. - - Otherwise if `openpyxl `_ is installed, - then ``openpyxl`` will be used. - - Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised. - - Specifying ``engine="xlrd"`` will continue to be allowed for the - indefinite future. """ from pandas.io.excel._odfreader import ODFReader @@ -947,59 +902,14 @@ def __init__( self, path_or_buffer, engine=None, storage_options: StorageOptions = None ): if engine is None: - # Determine ext and use odf for ods stream/file + engine = "xlrd" if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)): - ext = None if _is_ods_stream(path_or_buffer): engine = "odf" else: ext = os.path.splitext(str(path_or_buffer))[-1] if ext == ".ods": engine = "odf" - - if ( - import_optional_dependency( - "xlrd", raise_on_missing=False, on_version="ignore" - ) - is not None - ): - from xlrd import Book - - if isinstance(path_or_buffer, Book): - engine = "xlrd" - - # GH 35029 - Prefer openpyxl except for xls files - if engine is None: - if ext is None or isinstance(path_or_buffer, bytes) or ext == ".xls": - engine = "xlrd" - elif ( - import_optional_dependency( - "openpyxl", raise_on_missing=False, on_version="ignore" - ) - is not None - ): - engine = "openpyxl" - else: - caller = inspect.stack()[1] - if ( - caller.filename.endswith("pandas/io/excel/_base.py") - and caller.function == "read_excel" - ): - stacklevel = 4 - else: - stacklevel = 2 - warnings.warn( - "The xlrd engine is no longer maintained and is not " - "supported when using pandas with python >= 3.9. However, " - "the engine xlrd will continue to be allowed for the " - "indefinite future. Beginning with pandas 1.2.0, the " - "openpyxl engine will be used if it is installed and the " - "engine argument is not specified. 
Either install openpyxl " - "or specify engine='xlrd' to silence this warning.", - FutureWarning, - stacklevel=stacklevel, - ) - engine = "xlrd" if engine not in self._engines: raise ValueError(f"Unknown engine: {engine}") diff --git a/pandas/io/excel/_xlwt.py b/pandas/io/excel/_xlwt.py index 9a725c15de61e..9ede7cd0c2b95 100644 --- a/pandas/io/excel/_xlwt.py +++ b/pandas/io/excel/_xlwt.py @@ -45,9 +45,7 @@ def save(self): """ Save workbook to disk. """ - if self.sheets: - # fails when the ExcelWriter is just opened and then closed - self.book.save(self.handles.handle) + self.book.save(self.handles.handle) def write_cells( self, cells, sheet_name=None, startrow=0, startcol=0, freeze_panes=None diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 6d14d6172aa6c..fbda78a1842ca 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -159,13 +159,13 @@ def _initialize_chunksize(self, chunksize: Optional[int]) -> int: @property def _number_format(self) -> Dict[str, Any]: """Dictionary used for storing number formatting settings.""" - return { - "na_rep": self.na_rep, - "float_format": self.float_format, - "date_format": self.date_format, - "quoting": self.quoting, - "decimal": self.decimal, - } + return dict( + na_rep=self.na_rep, + float_format=self.float_format, + date_format=self.date_format, + quoting=self.quoting, + decimal=self.decimal, + ) @property def data_index(self) -> Index: diff --git a/pandas/io/formats/latex.py b/pandas/io/formats/latex.py index f6f3571955e6e..0212fd6f695cb 100644 --- a/pandas/io/formats/latex.py +++ b/pandas/io/formats/latex.py @@ -153,11 +153,11 @@ def pad_empties(x): break return [x[0]] + [i if i else " " * len(pad) for i in x[1:]] - gen = (pad_empties(i) for i in out) + out = (pad_empties(i) for i in out) # Add empty spaces for each column level clevels = self.frame.columns.nlevels - out = [[" " * len(i[-1])] * clevels + i for i in gen] + out = [[" " * len(i[-1])] * clevels + i for i in out] # Add the column names to the last index column cnames = self.frame.columns.names diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index 128e50d84657c..ac453839792f3 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -206,7 +206,7 @@ def as_escaped_string( translate = escape_chars escape_chars = list(escape_chars.keys()) else: - escape_chars = escape_chars or () + escape_chars = escape_chars or tuple() result = str(thing) for c in escape_chars: diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 4557c10927a15..0eeff44d0f74c 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -433,16 +433,16 @@ def format_attr(pair): else: table_attr += ' class="tex2jax_ignore"' - return { - "head": head, - "cellstyle": cellstyle, - "body": body, - "uuid": uuid, - "precision": precision, - "table_styles": table_styles, - "caption": caption, - "table_attributes": table_attr, - } + return dict( + head=head, + cellstyle=cellstyle, + body=body, + uuid=uuid, + precision=precision, + table_styles=table_styles, + caption=caption, + table_attributes=table_attr, + ) def format(self, formatter, subset=None, na_rep: Optional[str] = None) -> "Styler": """ diff --git a/pandas/io/html.py b/pandas/io/html.py index 4a2d4af62f3e9..334a3dab6c13a 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -794,8 +794,9 @@ def _data_to_frame(**kwargs): # fill out elements of body that are "ragged" _expand_elements(body) - with TextParser(body, header=header, **kwargs) as 
tp: - return tp.read() + tp = TextParser(body, header=header, **kwargs) + df = tp.read() + return df _valid_parsers = { diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index da085d0d0eb2f..e1feb1aa3fada 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -437,10 +437,6 @@ def read_json( This can only be passed if `lines=True`. If this is None, the file will be read into memory all at once. - .. versionchanged:: 1.2 - - ``JsonReader`` is a context manager. - compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer' For on-the-fly decompression of on-disk data. If 'infer', then use gzip, bz2, zip or xz if path_or_buf is a string ending in @@ -559,8 +555,7 @@ def read_json( if chunksize: return json_reader - with json_reader: - return json_reader.read() + return json_reader.read() class JsonReader(abc.Iterator): @@ -752,12 +747,6 @@ def __next__(self): self.close() raise StopIteration - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_value, traceback): - self.close() - class Parser: _split_keys: Tuple[str, ...] @@ -1109,7 +1098,7 @@ def _process_converter(self, f, filt=None): assert obj is not None # for mypy needs_new_obj = False - new_obj = {} + new_obj = dict() for i, (col, c) in enumerate(obj.items()): if filt(col, c): new_data, result = f(col, c) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 8b1184df92eaf..a19b132a7891d 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -1,6 +1,5 @@ """ parquet compat """ -from distutils.version import LooseVersion import io import os from typing import Any, AnyStr, Dict, List, Optional, Tuple @@ -178,39 +177,10 @@ def write( handles.close() def read( - self, - path, - columns=None, - use_nullable_dtypes=False, - storage_options: StorageOptions = None, - **kwargs, + self, path, columns=None, storage_options: StorageOptions = None, **kwargs ): kwargs["use_pandas_metadata"] = True - to_pandas_kwargs = {} - if use_nullable_dtypes: - if LooseVersion(self.api.__version__) >= "0.16": - import pandas as pd - - mapping = { - self.api.int8(): pd.Int8Dtype(), - self.api.int16(): pd.Int16Dtype(), - self.api.int32(): pd.Int32Dtype(), - self.api.int64(): pd.Int64Dtype(), - self.api.uint8(): pd.UInt8Dtype(), - self.api.uint16(): pd.UInt16Dtype(), - self.api.uint32(): pd.UInt32Dtype(), - self.api.uint64(): pd.UInt64Dtype(), - self.api.bool_(): pd.BooleanDtype(), - self.api.string(): pd.StringDtype(), - } - to_pandas_kwargs["types_mapper"] = mapping.get - else: - raise ValueError( - "'use_nullable_dtypes=True' is only supported for pyarrow >= 0.16 " - f"({self.api.__version__} is installed" - ) - path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle( path, kwargs.pop("filesystem", None), @@ -220,7 +190,7 @@ def read( try: return self.api.parquet.read_table( path_or_handle, columns=columns, **kwargs - ).to_pandas(**to_pandas_kwargs) + ).to_pandas() finally: if handles is not None: handles.close() @@ -288,12 +258,6 @@ def write( def read( self, path, columns=None, storage_options: StorageOptions = None, **kwargs ): - use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False) - if use_nullable_dtypes: - raise ValueError( - "The 'use_nullable_dtypes' argument is not supported for the " - "fastparquet engine" - ) path = stringify_path(path) parquet_kwargs = {} handles = None @@ -404,13 +368,7 @@ def to_parquet( return None -def read_parquet( - path, - engine: str = "auto", - columns=None, - use_nullable_dtypes: bool = False, - **kwargs, -): +def 
read_parquet(path, engine: str = "auto", columns=None, **kwargs): """ Load a parquet object from the file path, returning a DataFrame. @@ -439,15 +397,6 @@ def read_parquet( 'pyarrow' is unavailable. columns : list, default=None If not None, only these columns will be read from the file. - use_nullable_dtypes : bool, default False - If True, use dtypes that use ``pd.NA`` as missing value indicator - for the resulting DataFrame (only applicable for ``engine="pyarrow"``). - As new dtypes are added that support ``pd.NA`` in the future, the - output with this option will change to use those dtypes. - Note: this is an experimental option, and behaviour (e.g. additional - support dtypes) may change without notice. - - .. versionadded:: 1.2.0 **kwargs Any additional kwargs are passed to the engine. @@ -456,6 +405,4 @@ def read_parquet( DataFrame """ impl = get_engine(engine) - return impl.read( - path, columns=columns, use_nullable_dtypes=use_nullable_dtypes, **kwargs - ) + return impl.read(path, columns=columns, **kwargs) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 5b623c360c3ef..25e8d9acf4690 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -276,19 +276,11 @@ iterator : bool, default False Return TextFileReader object for iteration or getting chunks with ``get_chunk()``. - - .. versionchanged:: 1.2 - - ``TextFileReader`` is a context manager. chunksize : int, optional Return TextFileReader object for iteration. See the `IO Tools docs `_ for more information on ``iterator`` and ``chunksize``. - - .. versionchanged:: 1.2 - - ``TextFileReader`` is a context manager. compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer' For on-the-fly decompression of on-disk data. If 'infer' and `filepath_or_buffer` is path-like, then detect compression from the @@ -459,8 +451,12 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): if chunksize or iterator: return parser - with parser: - return parser.read(nrows) + try: + data = parser.read(nrows) + finally: + parser.close() + + return data _parser_defaults = { @@ -1078,12 +1074,6 @@ def get_chunk(self, size=None): size = min(size, self.nrows - self._currow) return self.read(nrows=size) - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_value, traceback): - self.close() - def _is_index_col(col): return col is not None and col is not False @@ -1891,11 +1881,7 @@ def __init__(self, src: FilePathOrBuffer, **kwds): # no attribute "mmap" [union-attr] self.handles.handle = self.handles.handle.mmap # type: ignore[union-attr] - try: - self._reader = parsers.TextReader(self.handles.handle, **kwds) - except Exception: - self.handles.close() - raise + self._reader = parsers.TextReader(self.handles.handle, **kwds) self.unnamed_cols = self._reader.unnamed_cols passed_names = self.names is None diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 3fe251d300856..d7ee4acc2e670 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2037,7 +2037,7 @@ def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str): val_kind = _ensure_decoded(self.kind) values = _maybe_convert(values, val_kind, encoding, errors) - kwargs = {} + kwargs = dict() kwargs["name"] = _ensure_decoded(self.index_name) if self.freq is not None: @@ -3237,7 +3237,7 @@ def __init__( self.non_index_axes = non_index_axes or [] self.values_axes = values_axes or [] self.data_columns = data_columns or [] - self.info = info or {} + self.info = info or dict() self.nan_rep = nan_rep @property @@ -3446,7 
+3446,7 @@ def get_attrs(self): """ retrieve our attributes """ self.non_index_axes = getattr(self.attrs, "non_index_axes", None) or [] self.data_columns = getattr(self.attrs, "data_columns", None) or [] - self.info = getattr(self.attrs, "info", None) or {} + self.info = getattr(self.attrs, "info", None) or dict() self.nan_rep = getattr(self.attrs, "nan_rep", None) self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None)) self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict")) @@ -3596,7 +3596,7 @@ def create_index(self, columns=None, optlevel=None, kind: Optional[str] = None): if not isinstance(columns, (tuple, list)): columns = [columns] - kw = {} + kw = dict() if optlevel is not None: kw["optlevel"] = optlevel if kind is not None: @@ -3689,7 +3689,7 @@ def validate_data_columns(self, data_columns, min_itemsize, non_index_axes): return [] axis, axis_labels = non_index_axes[0] - info = self.info.get(axis, {}) + info = self.info.get(axis, dict()) if info.get("type") == "MultiIndex" and data_columns: raise ValueError( f"cannot use a multi-index on axis [{axis}] with " @@ -4071,7 +4071,7 @@ def create_description( if expectedrows is None: expectedrows = max(self.nrows_expected, 10000) - d = {"name": "table", "expectedrows": expectedrows} + d = dict(name="table", expectedrows=expectedrows) # description from the axes & values d["description"] = {a.cname: a.typ for a in self.axes} @@ -4458,9 +4458,9 @@ def read( result = self._read_axes(where=where, start=start, stop=stop) info = ( - self.info.get(self.non_index_axes[0][0], {}) + self.info.get(self.non_index_axes[0][0], dict()) if len(self.non_index_axes) - else {} + else dict() ) inds = [i for i, ax in enumerate(self.axes) if ax is self.index_axes[0]] diff --git a/pandas/io/sas/sasreader.py b/pandas/io/sas/sasreader.py index 243218129fda6..3f0370209e9a8 100644 --- a/pandas/io/sas/sasreader.py +++ b/pandas/io/sas/sasreader.py @@ -26,12 +26,6 @@ def read(self, nrows=None): def close(self): pass - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_value, traceback): - self.close() - @overload def read_sas( @@ -91,17 +85,9 @@ def read_sas( Encoding for text data. If None, text data are stored as raw bytes. chunksize : int Read file `chunksize` lines at a time, returns iterator. - - .. versionchanged:: 1.2 - - ``TextFileReader`` is a context manager. iterator : bool, defaults to False If True, returns an iterator for reading the file incrementally. - .. versionchanged:: 1.2 - - ``TextFileReader`` is a context manager. 
- Returns ------- DataFrame if iterator=False and chunksize=None, else SAS7BDATReader @@ -148,5 +134,4 @@ def read_sas( if iterator or chunksize: return reader - with reader: - return reader.read() + return reader.read() diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 6f296d3c8d92f..d97ba6183c955 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1464,7 +1464,7 @@ def _read_value_labels(self) -> None: off = off[ii] val = val[ii] txt = self.path_or_buf.read(txtlen) - self.value_label_dict[labname] = {} + self.value_label_dict[labname] = dict() for i in range(n): end = off[i + 1] if i < n - 1 else txtlen self.value_label_dict[labname][val[i]] = self._decode(txt[off[i] : end]) diff --git a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py index ae4fff7b495d0..64cd43c230f28 100644 --- a/pandas/plotting/_matplotlib/timeseries.py +++ b/pandas/plotting/_matplotlib/timeseries.py @@ -1,7 +1,7 @@ # TODO: Use the fact that axis can have units to simplify the process import functools -from typing import TYPE_CHECKING, Optional, cast +from typing import TYPE_CHECKING, Optional import numpy as np @@ -26,7 +26,7 @@ if TYPE_CHECKING: from matplotlib.axes import Axes - from pandas import DatetimeIndex, Index, Series + from pandas import Index, Series # --------------------------------------------------------------------- # Plotting functions and monkey patches @@ -243,7 +243,6 @@ def maybe_convert_index(ax: "Axes", data): if freq is None: # We only get here for DatetimeIndex - data.index = cast("DatetimeIndex", data.index) freq = data.index.inferred_freq freq = to_offset(freq) diff --git a/pandas/tests/arithmetic/conftest.py b/pandas/tests/arithmetic/conftest.py index f507c6d4f45fb..149389b936def 100644 --- a/pandas/tests/arithmetic/conftest.py +++ b/pandas/tests/arithmetic/conftest.py @@ -18,6 +18,18 @@ def id_func(x): # ------------------------------------------------------------------ +@pytest.fixture( + params=[ + ("foo", None, None), + ("Egon", "Venkman", None), + ("NCC1701D", "NCC1701D", "NCC1701D"), + ] +) +def names(request): + """ + A 3-tuple of names, the first two for operands, the last for a result. 
+ """ + return request.param @pytest.fixture(params=[1, np.array(1, dtype=np.int64)]) diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index 092a3f0d4402f..0202337a4389a 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -822,7 +822,7 @@ def test_operators_timedelta64(self): tm.assert_series_equal(rs, xp) assert rs.dtype == "timedelta64[ns]" - df = DataFrame({"A": v1}) + df = DataFrame(dict(A=v1)) td = Series([timedelta(days=i) for i in range(3)]) assert td.dtype == "timedelta64[ns]" diff --git a/pandas/tests/arrays/boolean/test_arithmetic.py b/pandas/tests/arrays/boolean/test_arithmetic.py index 01de64568a011..1a4ab9799e8e5 100644 --- a/pandas/tests/arrays/boolean/test_arithmetic.py +++ b/pandas/tests/arrays/boolean/test_arithmetic.py @@ -5,7 +5,6 @@ import pandas as pd import pandas._testing as tm -from pandas.arrays import FloatingArray @pytest.fixture @@ -52,15 +51,13 @@ def test_sub(left_array, right_array): def test_div(left_array, right_array): + # for now division gives a float numpy array result = left_array / right_array - expected = FloatingArray( - np.array( - [1.0, np.inf, np.nan, 0.0, np.nan, np.nan, np.nan, np.nan, np.nan], - dtype="float64", - ), - np.array([False, False, True, False, False, True, True, True, True]), + expected = np.array( + [1.0, np.inf, np.nan, 0.0, np.nan, np.nan, np.nan, np.nan, np.nan], + dtype="float64", ) - tm.assert_extension_array_equal(result, expected) + tm.assert_numpy_array_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/arrays/boolean/test_function.py b/pandas/tests/arrays/boolean/test_function.py index 0f8743489b412..7665c350e3443 100644 --- a/pandas/tests/arrays/boolean/test_function.py +++ b/pandas/tests/arrays/boolean/test_function.py @@ -85,13 +85,6 @@ def test_value_counts_na(): tm.assert_series_equal(result, expected) -def test_value_counts_with_normalize(): - s = pd.Series([True, False, pd.NA], dtype="boolean") - result = s.value_counts(normalize=True) - expected = pd.Series([1, 1], index=[False, True], dtype="Float64") / 2 - tm.assert_series_equal(result, expected) - - def test_diff(): a = pd.array( [True, True, False, False, True, None, True, None, False], dtype="boolean" diff --git a/pandas/tests/arrays/categorical/test_missing.py b/pandas/tests/arrays/categorical/test_missing.py index 36ed790eff63c..cb0ba128c1fb7 100644 --- a/pandas/tests/arrays/categorical/test_missing.py +++ b/pandas/tests/arrays/categorical/test_missing.py @@ -62,13 +62,13 @@ def test_set_item_nan(self): "fillna_kwargs, msg", [ ( - {"value": 1, "method": "ffill"}, + dict(value=1, method="ffill"), "Cannot specify both 'value' and 'method'.", ), - ({}, "Must specify a fill 'value' or 'method'."), - ({"method": "bad"}, "Invalid fill method. Expecting .* bad"), + (dict(), "Must specify a fill 'value' or 'method'."), + (dict(method="bad"), "Invalid fill method. 
Expecting .* bad"), ( - {"value": Series([1, 2, 3, 4, "a"])}, + dict(value=Series([1, 2, 3, 4, "a"])), "Cannot setitem on a Categorical with a new category", ), ], diff --git a/pandas/tests/arrays/floating/test_function.py b/pandas/tests/arrays/floating/test_function.py index ef95eac316397..baf60a363ad29 100644 --- a/pandas/tests/arrays/floating/test_function.py +++ b/pandas/tests/arrays/floating/test_function.py @@ -113,13 +113,6 @@ def test_value_counts_empty(): tm.assert_series_equal(result, expected) -def test_value_counts_with_normalize(): - s = pd.Series([0.1, 0.2, 0.1, pd.NA], dtype="Float64") - result = s.value_counts(normalize=True) - expected = pd.Series([2, 1], index=[0.1, 0.2], dtype="Float64") / 3 - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("min_count", [0, 4]) def test_floating_array_sum(skipna, min_count, dtype): diff --git a/pandas/tests/arrays/integer/test_arithmetic.py b/pandas/tests/arrays/integer/test_arithmetic.py index 4b8d95ae83e4f..cf382dd5e37e0 100644 --- a/pandas/tests/arrays/integer/test_arithmetic.py +++ b/pandas/tests/arrays/integer/test_arithmetic.py @@ -3,11 +3,9 @@ import numpy as np import pytest -from pandas.compat.numpy import _np_version_under1p20 - import pandas as pd import pandas._testing as tm -from pandas.core.arrays import FloatingArray, integer_array +from pandas.core.arrays import integer_array import pandas.core.ops as ops # Basic test for the arithmetic array ops @@ -45,12 +43,13 @@ def test_sub(dtype): def test_div(dtype): + # for now division gives a float numpy array a = pd.array([1, 2, 3, None, 5], dtype=dtype) b = pd.array([0, 1, None, 3, 4], dtype=dtype) result = a / b - expected = pd.array([np.inf, 2, None, None, 1.25], dtype="Float64") - tm.assert_extension_array_equal(result, expected) + expected = np.array([np.inf, 2, np.nan, np.nan, 1.25], dtype="float64") + tm.assert_numpy_array_equal(result, expected) @pytest.mark.parametrize("zero, negative", [(0, False), (0.0, False), (-0.0, True)]) @@ -58,13 +57,10 @@ def test_divide_by_zero(zero, negative): # https://github.com/pandas-dev/pandas/issues/27398 a = pd.array([0, 1, -1, None], dtype="Int64") result = a / zero - expected = FloatingArray( - np.array([np.nan, np.inf, -np.inf, 1], dtype="float64"), - np.array([False, False, False, True]), - ) + expected = np.array([np.nan, np.inf, -np.inf, np.nan]) if negative: expected *= -1 - tm.assert_extension_array_equal(result, expected) + tm.assert_numpy_array_equal(result, expected) def test_floordiv(dtype): @@ -101,11 +97,8 @@ def test_pow_scalar(): tm.assert_extension_array_equal(result, expected) result = a ** np.nan - expected = FloatingArray( - np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype="float64"), - np.array([False, False, False, True, False]), - ) - tm.assert_extension_array_equal(result, expected) + expected = np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype="float64") + tm.assert_numpy_array_equal(result, expected) # reversed a = a[1:] # Can't raise integers to negative powers. 
@@ -123,11 +116,8 @@ def test_pow_scalar(): tm.assert_extension_array_equal(result, expected) result = np.nan ** a - expected = FloatingArray( - np.array([1, np.nan, np.nan, np.nan], dtype="float64"), - np.array([False, False, True, False]), - ) - tm.assert_extension_array_equal(result, expected) + expected = np.array([1, np.nan, np.nan, np.nan], dtype="float64") + tm.assert_numpy_array_equal(result, expected) def test_pow_array(): @@ -141,10 +131,10 @@ def test_pow_array(): def test_rpow_one_to_na(): # https://github.com/pandas-dev/pandas/issues/22022 # https://github.com/pandas-dev/pandas/issues/29997 - arr = pd.array([np.nan, np.nan], dtype="Int64") + arr = integer_array([np.nan, np.nan]) result = np.array([1.0, 2.0]) ** arr - expected = pd.array([1.0, np.nan], dtype="Float64") - tm.assert_extension_array_equal(result, expected) + expected = np.array([1.0, np.nan]) + tm.assert_numpy_array_equal(result, expected) @pytest.mark.parametrize("other", [0, 0.5]) @@ -206,19 +196,9 @@ def test_arith_coerce_scalar(data, all_arithmetic_operators): result = op(s, other) expected = op(s.astype(float), other) - expected = expected.astype("Float64") # rfloordiv results in nan instead of inf - if all_arithmetic_operators == "__rfloordiv__" and _np_version_under1p20: - # for numpy 1.20 https://github.com/numpy/numpy/pull/16161 - # updated floordiv, now matches our behavior defined in core.ops - mask = ( - ((expected == np.inf) | (expected == -np.inf)).fillna(False).to_numpy(bool) - ) - expected.array._data[mask] = np.nan - # rmod results in NaN that wasn't NA in original nullable Series -> unmask it - elif all_arithmetic_operators == "__rmod__": - mask = (s == 0).fillna(False).to_numpy(bool) - expected.array._mask[mask] = False + if all_arithmetic_operators == "__rfloordiv__": + expected[(expected == np.inf) | (expected == -np.inf)] = np.nan tm.assert_series_equal(result, expected) @@ -231,7 +211,7 @@ def test_arithmetic_conversion(all_arithmetic_operators, other): s = pd.Series([1, 2, 3], dtype="Int64") result = op(s, other) - assert result.dtype == "Float64" + assert result.dtype is np.dtype("float") def test_cross_type_arithmetic(): diff --git a/pandas/tests/arrays/integer/test_function.py b/pandas/tests/arrays/integer/test_function.py index 521547cc7357d..9cdea1c71f109 100644 --- a/pandas/tests/arrays/integer/test_function.py +++ b/pandas/tests/arrays/integer/test_function.py @@ -127,14 +127,6 @@ def test_value_counts_empty(): tm.assert_series_equal(result, expected) -def test_value_counts_with_normalize(): - # GH 33172 - s = pd.Series([1, 2, 1, pd.NA], dtype="Int64") - result = s.value_counts(normalize=True) - expected = pd.Series([2, 1], index=[1, 2], dtype="Float64") / 3 - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("min_count", [0, 4]) def test_integer_array_sum(skipna, min_count, any_nullable_int_dtype): diff --git a/pandas/tests/arrays/masked/test_arithmetic.py b/pandas/tests/arrays/masked/test_arithmetic.py index 1d2833c5da276..6de10fd896878 100644 --- a/pandas/tests/arrays/masked/test_arithmetic.py +++ b/pandas/tests/arrays/masked/test_arithmetic.py @@ -43,7 +43,11 @@ def test_array_scalar_like_equivalence(data, all_arithmetic_operators): for scalar in [scalar, data.dtype.type(scalar)]: result = op(data, scalar) expected = op(data, scalar_array) - tm.assert_extension_array_equal(result, expected) + if isinstance(expected, ExtensionArray): + tm.assert_extension_array_equal(result, 
expected) + else: + # TODO div still gives float ndarray -> remove this once we have Float EA + tm.assert_numpy_array_equal(result, expected) def test_array_NA(data, all_arithmetic_operators): diff --git a/pandas/tests/arrays/sparse/test_arithmetics.py b/pandas/tests/arrays/sparse/test_arithmetics.py index 61f4e3e50d09d..c9f1dd7f589fc 100644 --- a/pandas/tests/arrays/sparse/test_arithmetics.py +++ b/pandas/tests/arrays/sparse/test_arithmetics.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas.compat.numpy import _np_version_under1p20 - import pandas as pd import pandas._testing as tm from pandas.core import ops @@ -118,15 +116,9 @@ def _check_logical_ops(self, a, b, a_dense, b_dense): @pytest.mark.parametrize("scalar", [0, 1, 3]) @pytest.mark.parametrize("fill_value", [None, 0, 2]) def test_float_scalar( - self, kind, mix, all_arithmetic_functions, fill_value, scalar, request + self, kind, mix, all_arithmetic_functions, fill_value, scalar ): op = all_arithmetic_functions - - if not _np_version_under1p20: - if op in [operator.floordiv, ops.rfloordiv]: - mark = pytest.mark.xfail(strict=False, reason="GH#38172") - request.node.add_marker(mark) - values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) a = self._klass(values, kind=kind, fill_value=fill_value) @@ -150,11 +142,15 @@ def test_float_scalar_comparison(self, kind): self._check_comparison_ops(a, 0, values, 0) self._check_comparison_ops(a, 3, values, 3) - def test_float_same_index_without_nans( - self, kind, mix, all_arithmetic_functions, request - ): + def test_float_same_index(self, kind, mix, all_arithmetic_functions): # when sp_index are the same op = all_arithmetic_functions + values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + rvalues = self._base([np.nan, 2, 3, 4, np.nan, 0, 1, 3, 2, np.nan]) + + a = self._klass(values, kind=kind) + b = self._klass(rvalues, kind=kind) + self._check_numeric_ops(a, b, values, rvalues, mix, op) values = self._base([0.0, 1.0, 2.0, 6.0, 0.0, 0.0, 1.0, 2.0, 1.0, 0.0]) rvalues = self._base([0.0, 2.0, 3.0, 4.0, 0.0, 0.0, 1.0, 3.0, 2.0, 0.0]) @@ -163,24 +159,6 @@ def test_float_same_index_without_nans( b = self._klass(rvalues, kind=kind, fill_value=0) self._check_numeric_ops(a, b, values, rvalues, mix, op) - def test_float_same_index_with_nans( - self, kind, mix, all_arithmetic_functions, request - ): - # when sp_index are the same - op = all_arithmetic_functions - - if not _np_version_under1p20: - if op in [operator.floordiv, ops.rfloordiv]: - mark = pytest.mark.xfail(strict=False, reason="GH#38172") - request.node.add_marker(mark) - - values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) - rvalues = self._base([np.nan, 2, 3, 4, np.nan, 0, 1, 3, 2, np.nan]) - - a = self._klass(values, kind=kind) - b = self._klass(rvalues, kind=kind) - self._check_numeric_ops(a, b, values, rvalues, mix, op) - def test_float_same_index_comparison(self, kind): # when sp_index are the same values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) @@ -346,14 +324,9 @@ def test_bool_array_logical(self, kind, fill_value): b = self._klass(rvalues, kind=kind, dtype=np.bool_, fill_value=fill_value) self._check_logical_ops(a, b, values, rvalues) - def test_mixed_array_float_int(self, kind, mix, all_arithmetic_functions, request): + def test_mixed_array_float_int(self, kind, mix, all_arithmetic_functions): op = all_arithmetic_functions - if not _np_version_under1p20: - if op in [operator.floordiv, ops.rfloordiv] and mix: - mark = pytest.mark.xfail(strict=True, 
reason="GH#38172") - request.node.add_marker(mark) - rdtype = "int64" values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) diff --git a/pandas/tests/arrays/sparse/test_libsparse.py b/pandas/tests/arrays/sparse/test_libsparse.py index 992dff218415d..517dc4a2c3d8b 100644 --- a/pandas/tests/arrays/sparse/test_libsparse.py +++ b/pandas/tests/arrays/sparse/test_libsparse.py @@ -12,47 +12,42 @@ TEST_LENGTH = 20 -plain_case = { - "xloc": [0, 7, 15], - "xlen": [3, 5, 5], - "yloc": [2, 9, 14], - "ylen": [2, 3, 5], - "intersect_loc": [2, 9, 15], - "intersect_len": [1, 3, 4], -} -delete_blocks = { - "xloc": [0, 5], - "xlen": [4, 4], - "yloc": [1], - "ylen": [4], - "intersect_loc": [1], - "intersect_len": [3], -} -split_blocks = { - "xloc": [0], - "xlen": [10], - "yloc": [0, 5], - "ylen": [3, 7], - "intersect_loc": [0, 5], - "intersect_len": [3, 5], -} -skip_block = { - "xloc": [10], - "xlen": [5], - "yloc": [0, 12], - "ylen": [5, 3], - "intersect_loc": [12], - "intersect_len": [3], -} - -no_intersect = { - "xloc": [0, 10], - "xlen": [4, 6], - "yloc": [5, 17], - "ylen": [4, 2], - "intersect_loc": [], - "intersect_len": [], -} +plain_case = dict( + xloc=[0, 7, 15], + xlen=[3, 5, 5], + yloc=[2, 9, 14], + ylen=[2, 3, 5], + intersect_loc=[2, 9, 15], + intersect_len=[1, 3, 4], +) +delete_blocks = dict( + xloc=[0, 5], xlen=[4, 4], yloc=[1], ylen=[4], intersect_loc=[1], intersect_len=[3] +) +split_blocks = dict( + xloc=[0], + xlen=[10], + yloc=[0, 5], + ylen=[3, 7], + intersect_loc=[0, 5], + intersect_len=[3, 5], +) +skip_block = dict( + xloc=[10], + xlen=[5], + yloc=[0, 12], + ylen=[5, 3], + intersect_loc=[12], + intersect_len=[3], +) + +no_intersect = dict( + xloc=[0, 10], + xlen=[4, 6], + yloc=[5, 17], + ylen=[4, 2], + intersect_loc=[], + intersect_len=[], +) def check_cases(_check_case): diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index c70d55b07661d..9a1634380aaba 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -366,15 +366,6 @@ def test_astype_int(dtype, request): tm.assert_extension_array_equal(result, expected) -def test_astype_float(any_float_allowed_nullable_dtype): - # Don't compare arrays (37974) - ser = pd.Series(["1.1", pd.NA, "3.3"], dtype="string") - - result = ser.astype(any_float_allowed_nullable_dtype) - expected = pd.Series([1.1, np.nan, 3.3], dtype=any_float_allowed_nullable_dtype) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.xfail(reason="Not implemented StringArray.sum") def test_reduce(skipna, dtype): @@ -495,18 +486,6 @@ def test_value_counts_na(dtype, request): tm.assert_series_equal(result, expected) -def test_value_counts_with_normalize(dtype, request): - if dtype == "arrow_string": - reason = "TypeError: boolean value of NA is ambiguous" - mark = pytest.mark.xfail(reason=reason) - request.node.add_marker(mark) - - s = pd.Series(["a", "b", "a", pd.NA], dtype=dtype) - result = s.value_counts(normalize=True) - expected = pd.Series([2, 1], index=["a", "b"], dtype="Float64") / 3 - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize( "values, expected", [ diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index c489aa5867632..159f52a4c7c25 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -3,6 +3,7 @@ import numpy as np import pytest +import pytz from pandas._libs import NaT, 
OutOfBoundsDatetime, Timestamp from pandas.compat.numpy import np_version_under1p18 @@ -268,16 +269,18 @@ def test_searchsorted(self): assert result == 10 @pytest.mark.parametrize("box", [None, "index", "series"]) - def test_searchsorted_castable_strings(self, arr1d, box, request): + def test_searchsorted_castable_strings(self, arr1d, box): if isinstance(arr1d, DatetimeArray): tz = arr1d.tz - ts1, ts2 = arr1d[1:3] - if tz is not None and ts1.tz.tzname(ts1) != ts2.tz.tzname(ts2): + if ( + tz is not None + and tz is not pytz.UTC + and not isinstance(tz, pytz._FixedOffset) + ): # If we have e.g. tzutc(), when we cast to string and parse # back we get pytz.UTC, and then consider them different timezones # so incorrectly raise. - mark = pytest.mark.xfail(reason="timezone comparisons inconsistent") - request.node.add_marker(mark) + pytest.xfail(reason="timezone comparisons inconsistent") arr = arr1d if box is None: @@ -388,17 +391,19 @@ def test_setitem(self): expected[:2] = expected[-2:] tm.assert_numpy_array_equal(arr.asi8, expected) - def test_setitem_strs(self, arr1d, request): + def test_setitem_strs(self, arr1d): # Check that we parse strs in both scalar and listlike if isinstance(arr1d, DatetimeArray): tz = arr1d.tz - ts1, ts2 = arr1d[-2:] - if tz is not None and ts1.tz.tzname(ts1) != ts2.tz.tzname(ts2): + if ( + tz is not None + and tz is not pytz.UTC + and not isinstance(tz, pytz._FixedOffset) + ): # If we have e.g. tzutc(), when we cast to string and parse # back we get pytz.UTC, and then consider them different timezones # so incorrectly raise. - mark = pytest.mark.xfail(reason="timezone comparisons inconsistent") - request.node.add_marker(mark) + pytest.xfail(reason="timezone comparisons inconsistent") # Setting list-like of strs expected = arr1d.copy() diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index cc4aed5e4413d..a6fdb82e48197 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -316,35 +316,18 @@ def test_array_multiindex_raises(): TimedeltaArray(np.array([0, 3600000000000], dtype="i8"), freq="H"), np.array([0, 3600000000000], dtype="m8[ns]"), ), - # GH#26406 tz is preserved in Categorical[dt64tz] - ( - pd.Categorical(pd.date_range("2016-01-01", periods=2, tz="US/Pacific")), - np.array( - [ - Timestamp("2016-01-01", tz="US/Pacific"), - Timestamp("2016-01-02", tz="US/Pacific"), - ] - ), - ), ], ) -def test_to_numpy(array, expected, index_or_series_or_array, request): - box = index_or_series_or_array +def test_to_numpy(array, expected, index_or_series): + box = index_or_series thing = box(array) if array.dtype.name in ("Int64", "Sparse[int64, 0]") and box is pd.Index: pytest.skip(f"No index type for {array.dtype}") - if array.dtype.name == "int64" and box is pd.array: - mark = pytest.mark.xfail(reason="thing is Int64 and to_numpy() returns object") - request.node.add_marker(mark) - result = thing.to_numpy() tm.assert_numpy_array_equal(result, expected) - result = np.asarray(thing) - tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize("as_series", [True, False]) @pytest.mark.parametrize( diff --git a/pandas/tests/dtypes/cast/test_convert_objects.py b/pandas/tests/dtypes/cast/test_convert_objects.py new file mode 100644 index 0000000000000..a28d554acd312 --- /dev/null +++ b/pandas/tests/dtypes/cast/test_convert_objects.py @@ -0,0 +1,12 @@ +import numpy as np +import pytest + +from pandas.core.dtypes.cast import maybe_convert_objects + + +@pytest.mark.parametrize("data", [[1, 
2], ["apply", "banana"]]) +def test_maybe_convert_objects_copy(data): + arr = np.array(data) + out = maybe_convert_objects(arr) + + assert arr is not out diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index ce6737db44195..2db9a9a403e1c 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -105,16 +105,16 @@ def test_period_dtype(self, dtype): assert com.pandas_dtype(dtype) == dtype -dtypes = { - "datetime_tz": com.pandas_dtype("datetime64[ns, US/Eastern]"), - "datetime": com.pandas_dtype("datetime64[ns]"), - "timedelta": com.pandas_dtype("timedelta64[ns]"), - "period": PeriodDtype("D"), - "integer": np.dtype(np.int64), - "float": np.dtype(np.float64), - "object": np.dtype(object), - "category": com.pandas_dtype("category"), -} +dtypes = dict( + datetime_tz=com.pandas_dtype("datetime64[ns, US/Eastern]"), + datetime=com.pandas_dtype("datetime64[ns]"), + timedelta=com.pandas_dtype("timedelta64[ns]"), + period=PeriodDtype("D"), + integer=np.dtype(np.int64), + float=np.dtype(np.float64), + object=np.dtype(object), + category=com.pandas_dtype("category"), +) @pytest.mark.parametrize("name1,dtype1", list(dtypes.items()), ids=lambda x: str(x)) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 872dd03768833..a419cb0dded79 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -199,14 +199,6 @@ def test_not_string(self): # though CategoricalDtype has object kind, it cannot be string assert not is_string_dtype(CategoricalDtype()) - def test_repr_range_categories(self): - rng = pd.Index(range(3)) - dtype = CategoricalDtype(categories=rng, ordered=False) - result = repr(dtype) - - expected = "CategoricalDtype(categories=range(0, 3), ordered=False)" - assert result == expected - class TestDatetimeTZDtype(Base): @pytest.fixture diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index c9ca5cb34d271..27fac95a16b7a 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -68,7 +68,7 @@ def coerce(request): ((1,), True, "tuple"), (tuple(), True, "tuple-empty"), ({"a": 1}, True, "dict"), - ({}, True, "dict-empty"), + (dict(), True, "dict-empty"), ({"a", 1}, "set", "set"), (set(), "set", "set-empty"), (frozenset({"a", 1}), "set", "frozenset"), @@ -1489,7 +1489,7 @@ def test_datetimeindex_from_empty_datetime64_array(): def test_nan_to_nat_conversions(): df = DataFrame( - {"A": np.asarray(range(10), dtype="float64"), "B": Timestamp("20010101")} + dict({"A": np.asarray(range(10), dtype="float64"), "B": Timestamp("20010101")}) ) df.iloc[3:6, :] = np.nan result = df.loc[4, "B"] diff --git a/pandas/tests/extension/arrow/test_bool.py b/pandas/tests/extension/arrow/test_bool.py index 922b3b94c16c1..12426a0c92c55 100644 --- a/pandas/tests/extension/arrow/test_bool.py +++ b/pandas/tests/extension/arrow/test_bool.py @@ -50,10 +50,6 @@ def test_view(self, data): # __setitem__ does not work, so we only have a smoke-test data.view() - @pytest.mark.xfail(raises=AssertionError, reason="Not implemented yet") - def test_contains(self, data, data_missing, nulls_fixture): - super().test_contains(data, data_missing, nulls_fixture) - class TestConstructors(BaseArrowTests, base.BaseConstructorsTests): def test_from_dtype(self, data): diff --git a/pandas/tests/extension/base/interface.py b/pandas/tests/extension/base/interface.py index d7997310dde3d..9ae4b01508d79 100644 --- 
a/pandas/tests/extension/base/interface.py +++ b/pandas/tests/extension/base/interface.py @@ -29,29 +29,6 @@ def test_can_hold_na_valid(self, data): # GH-20761 assert data._can_hold_na is True - def test_contains(self, data, data_missing, nulls_fixture): - # GH-37867 - # Tests for membership checks. Membership checks for nan-likes is tricky and - # the settled on rule is: `nan_like in arr` is True if nan_like is - # arr.dtype.na_value and arr.isna().any() is True. Else the check returns False. - - na_value = data.dtype.na_value - # ensure data without missing values - data = data[~data.isna()] - - # first elements are non-missing - assert data[0] in data - assert data_missing[0] in data_missing - - # check the presence of na_value - assert na_value in data_missing - assert na_value not in data - - if nulls_fixture is not na_value: - # the data can never contain other nan-likes than na_value - assert nulls_fixture not in data - assert nulls_fixture not in data_missing - def test_memory_usage(self, data): s = pd.Series(data) result = s.memory_usage(index=False) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 1cc03d4f4f2bd..29a59cdefbd83 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -447,10 +447,10 @@ def test_repeat(self, data, repeats, as_series, use_numpy): @pytest.mark.parametrize( "repeats, kwargs, error, msg", [ - (2, {"axis": 1}, ValueError, "axis"), - (-1, {}, ValueError, "negative"), - ([1, 2], {}, ValueError, "shape"), - (2, {"foo": "bar"}, TypeError, "'foo'"), + (2, dict(axis=1), ValueError, "axis"), + (-1, dict(), ValueError, "negative"), + ([1, 2], dict(), ValueError, "shape"), + (2, dict(foo="bar"), TypeError, "'foo'"), ], ) def test_repeat_raises(self, data, repeats, kwargs, error, msg, use_numpy): diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index a713550dafa5c..9ede9c7fbd0fd 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -155,14 +155,6 @@ def __setitem__(self, key, value): def __len__(self) -> int: return len(self._data) - def __contains__(self, item) -> bool: - if not isinstance(item, decimal.Decimal): - return False - elif item.is_nan(): - return self.isna().any() - else: - return super().__contains__(item) - @property def nbytes(self) -> int: n = len(self) diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 3a5e49796c53b..74ca341e27bf8 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -143,13 +143,6 @@ def test_custom_asserts(self): with pytest.raises(AssertionError, match=msg): self.assert_frame_equal(a.to_frame(), b.to_frame()) - @pytest.mark.xfail( - reason="comparison method not implemented for JSONArray (GH-37867)" - ) - def test_contains(self, data): - # GH-37867 - super().test_contains(data) - class TestConstructors(BaseJSON, base.BaseConstructorsTests): @pytest.mark.skip(reason="not implemented constructor from dtype") diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index ced7ea9261310..8acbeaf0b8170 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -130,7 +130,7 @@ def _check_op(self, s, op, other, op_name, exc=NotImplementedError): elif op_name in ("__truediv__", "__rtruediv__"): # combine with bools does not generate the correct result # (numpy behaviour for 
div is to regard the bools as numeric) - expected = s.astype(float).combine(other, op).astype("Float64") + expected = s.astype(float).combine(other, op) if op_name == "__rpow__": # for rpow, combine does not propagate NaN expected[result.isna()] = np.nan @@ -235,10 +235,6 @@ def test_searchsorted(self, data_for_sorting, as_series): def test_value_counts(self, all_data, dropna): return super().test_value_counts(all_data, dropna) - @pytest.mark.skip(reason="uses nullable integer") - def test_value_counts_with_normalize(self, data): - pass - def test_argmin_argmax(self, data_for_sorting, data_missing_for_sorting): # override because there are only 2 unique values diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index d03a9ab6b2588..95f338cbc3240 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -87,28 +87,6 @@ def test_memory_usage(self, data): # Is this deliberate? super().test_memory_usage(data) - def test_contains(self, data, data_missing, nulls_fixture): - # GH-37867 - # na value handling in Categorical.__contains__ is deprecated. - # See base.BaseInterFaceTests.test_contains for more details. - - na_value = data.dtype.na_value - # ensure data without missing values - data = data[~data.isna()] - - # first elements are non-missing - assert data[0] in data - assert data_missing[0] in data_missing - - # check the presence of na_value - assert na_value in data_missing - assert na_value not in data - - # Categoricals can contain other nan-likes than na_value - if nulls_fixture is not na_value: - assert nulls_fixture not in data - assert nulls_fixture in data_missing # this line differs from super method - class TestConstructors(base.BaseConstructorsTests): pass diff --git a/pandas/tests/extension/test_floating.py b/pandas/tests/extension/test_floating.py index c08c31e90fecc..00881178de1b4 100644 --- a/pandas/tests/extension/test_floating.py +++ b/pandas/tests/extension/test_floating.py @@ -184,10 +184,6 @@ def test_value_counts(self, all_data, dropna): self.assert_series_equal(result, expected) - @pytest.mark.skip(reason="uses nullable integer") - def test_value_counts_with_normalize(self, data): - pass - class TestCasting(base.BaseCastingTests): pass diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index b1461dcbd9e53..725533765ca2c 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -130,7 +130,10 @@ def _check_op(self, s, op, other, op_name, exc=NotImplementedError): expected = s.combine(other, op) if op_name in ("__rtruediv__", "__truediv__", "__div__"): - expected = expected.fillna(np.nan).astype("Float64") + expected = expected.fillna(np.nan).astype(float) + if op_name == "__rtruediv__": + # TODO reverse operators result in object dtype + result = result.astype(float) elif op_name.startswith("__r"): # TODO reverse operators result in object dtype # see https://github.com/pandas-dev/pandas/issues/22024 @@ -221,10 +224,6 @@ def test_value_counts(self, all_data, dropna): self.assert_series_equal(result, expected) - @pytest.mark.skip(reason="uses nullable integer") - def test_value_counts_with_normalize(self, data): - pass - class TestCasting(base.BaseCastingTests): pass diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index d49c4c5cf4889..db1940226e04e 100644 --- a/pandas/tests/extension/test_string.py +++ 
b/pandas/tests/extension/test_string.py @@ -118,10 +118,6 @@ class TestMethods(base.BaseMethodsTests): def test_value_counts(self, all_data, dropna): return super().test_value_counts(all_data, dropna) - @pytest.mark.skip(reason="returns nullable") - def test_value_counts_with_normalize(self, data): - pass - class TestCasting(base.BaseCastingTests): pass diff --git a/pandas/tests/frame/apply/test_frame_apply.py b/pandas/tests/frame/apply/test_frame_apply.py index 9ec56c3429b22..15952f36b0fae 100644 --- a/pandas/tests/frame/apply/test_frame_apply.py +++ b/pandas/tests/frame/apply/test_frame_apply.py @@ -58,12 +58,6 @@ def test_apply(self, float_frame): assert isinstance(df["c0"].dtype, CategoricalDtype) assert isinstance(df["c1"].dtype, CategoricalDtype) - def test_apply_axis1_with_ea(self): - # GH#36785 - df = DataFrame({"A": [Timestamp("2013-01-01", tz="UTC")]}) - result = df.apply(lambda x: x, axis=1) - tm.assert_frame_equal(result, df) - def test_apply_mixed_datetimelike(self): # mixed datetimelike # GH 7778 diff --git a/pandas/tests/frame/common.py b/pandas/tests/frame/common.py index 95ebaa4641d1b..73e60ff389038 100644 --- a/pandas/tests/frame/common.py +++ b/pandas/tests/frame/common.py @@ -5,7 +5,7 @@ def _check_mixed_float(df, dtype=None): # float16 are most likely to be upcasted to float32 - dtypes = {"A": "float32", "B": "float32", "C": "float16", "D": "float64"} + dtypes = dict(A="float32", B="float32", C="float16", D="float64") if isinstance(dtype, str): dtypes = {k: dtype for k, v in dtypes.items()} elif isinstance(dtype, dict): @@ -21,7 +21,7 @@ def _check_mixed_float(df, dtype=None): def _check_mixed_int(df, dtype=None): - dtypes = {"A": "int32", "B": "uint64", "C": "uint8", "D": "int64"} + dtypes = dict(A="int32", B="uint64", C="uint8", D="int64") if isinstance(dtype, str): dtypes = {k: dtype for k, v in dtypes.items()} elif isinstance(dtype, dict): diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 49eb570c4ffe0..e33009f4597f0 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -241,24 +241,6 @@ def inc(x): expected = DataFrame([[-1, inc], [inc, -1]]) tm.assert_frame_equal(df, expected) - @pytest.mark.parametrize( - "cols, values, expected", - [ - (["C", "D", "D", "a"], [1, 2, 3, 4], 4), # with duplicates - (["D", "C", "D", "a"], [1, 2, 3, 4], 4), # mixed order - (["C", "B", "B", "a"], [1, 2, 3, 4], 4), # other duplicate cols - (["C", "B", "a"], [1, 2, 3], 3), # no duplicates - (["B", "C", "a"], [3, 2, 1], 1), # alphabetical order - (["C", "a", "B"], [3, 2, 1], 2), # in the middle - ], - ) - def test_setitem_same_column(self, cols, values, expected): - # GH 23239 - df = DataFrame([values], columns=cols) - df["a"] = df["a"] - result = df["a"].values[0] - assert result == expected - def test_getitem_boolean( self, float_string_frame, mixed_float_frame, mixed_int_frame, datetime_frame ): @@ -732,7 +714,7 @@ def test_setitem_empty(self): tm.assert_frame_equal(result, df) @pytest.mark.parametrize("dtype", ["float", "int64"]) - @pytest.mark.parametrize("kwargs", [{}, {"index": [1]}, {"columns": ["A"]}]) + @pytest.mark.parametrize("kwargs", [dict(), dict(index=[1]), dict(columns=["A"])]) def test_setitem_empty_frame_with_boolean(self, dtype, kwargs): # see gh-10126 kwargs["dtype"] = dtype @@ -1256,7 +1238,7 @@ def test_single_element_ix_dont_upcast(self, float_frame): assert is_integer(result) # GH 11617 - df = DataFrame({"a": [1.23]}) + df = 
DataFrame(dict(a=[1.23])) df["b"] = 666 result = df.loc[0, "b"] diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index 884cb6c20b77e..e4a66ea9133dd 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -319,24 +319,6 @@ def test_setitem_bool_with_numeric_index(self, dtype): tm.assert_index_equal(df.columns, expected_cols) -class TestDataFrameSetItemWithExpansion: - def test_setitem_listlike_views(self): - # GH#38148 - df = DataFrame({"a": [1, 2, 3], "b": [4, 4, 6]}) - - # get one column as a view of df - ser = df["a"] - - # add columns with list-like indexer - df[["c", "d"]] = np.array([[0.1, 0.2], [0.3, 0.4], [0.4, 0.5]]) - - # edit in place the first column to check view semantics - df.iloc[0, 0] = 100 - - expected = Series([100, 2, 3], name="a") - tm.assert_series_equal(ser, expected) - - class TestDataFrameSetItemSlicing: def test_setitem_slice_position(self): # GH#31469 diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index acdb5726e4adb..3495247585236 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -356,11 +356,11 @@ def test_where_datetime(self): # GH 3311 df = DataFrame( - { - "A": date_range("20130102", periods=5), - "B": date_range("20130104", periods=5), - "C": np.random.randn(5), - } + dict( + A=date_range("20130102", periods=5), + B=date_range("20130104", periods=5), + C=np.random.randn(5), + ) ) stamp = datetime(2013, 1, 3) @@ -618,7 +618,7 @@ def test_df_where_change_dtype(self): tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("kwargs", [{}, {"other": None}]) + @pytest.mark.parametrize("kwargs", [dict(), dict(other=None)]) def test_df_where_with_category(self, kwargs): # GH#16979 df = DataFrame(np.arange(2 * 3).reshape(2, 3), columns=list("ABC")) diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index d79969eac0323..f05c90f37ea8a 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -563,7 +563,7 @@ def test_astype_empty_dtype_dict(self): # issue mentioned further down in the following issue's thread # https://github.com/pandas-dev/pandas/issues/33113 df = DataFrame() - result = df.astype({}) + result = df.astype(dict()) tm.assert_frame_equal(result, df) assert result is not df diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index 934ad9eb8213a..08c4293323500 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -103,7 +103,6 @@ def test_combine_first_mixed_bug(self): combined = frame1.combine_first(frame2) assert len(combined.columns) == 5 - def test_combine_first_same_as_in_update(self): # gh 3016 (same as in update) df = DataFrame( [[1.0, 2.0, False, True], [4.0, 5.0, True, False]], @@ -119,7 +118,6 @@ def test_combine_first_same_as_in_update(self): df.loc[0, "A"] = 45 tm.assert_frame_equal(result, df) - def test_combine_first_doc_example(self): # doc example df1 = DataFrame( {"A": [1.0, np.nan, 3.0, 5.0, np.nan], "B": [np.nan, 2.0, 3.0, np.nan, 6.0]} @@ -136,56 +134,38 @@ def test_combine_first_doc_example(self): expected = DataFrame({"A": [1, 2, 3, 5, 3, 7.0], "B": [np.nan, 2, 3, 4, 6, 8]}) tm.assert_frame_equal(result, expected) - def test_combine_first_return_obj_type_with_bools(self): - # GH3552 - 
+ # GH3552, return object dtype with bools df1 = DataFrame( [[np.nan, 3.0, True], [-4.6, np.nan, True], [np.nan, 7.0, False]] ) df2 = DataFrame([[-42.6, np.nan, True], [-5.0, 1.6, False]], index=[1, 2]) - expected = Series([True, True, False], name=2, dtype=object) - - result_12 = df1.combine_first(df2)[2] - tm.assert_series_equal(result_12, expected) - - result_21 = df2.combine_first(df1)[2] - tm.assert_series_equal(result_21, expected) - - @pytest.mark.parametrize( - "data1, data2, data_expected", - ( - ( - [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], - [None, None, None], - [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], - ), - ( - [None, None, None], - [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], - [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], - ), - ( - [datetime(2000, 1, 2), None, None], - [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], - [datetime(2000, 1, 2), datetime(2000, 1, 2), datetime(2000, 1, 3)], - ), - ( - [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], - [datetime(2000, 1, 2), None, None], - [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], - ), - ), - ) - def test_combine_first_convert_datatime_correctly( - self, data1, data2, data_expected - ): - # GH 3593 + result = df1.combine_first(df2)[2] + expected = Series([True, True, False], name=2) + tm.assert_series_equal(result, expected) - df1, df2 = DataFrame({"a": data1}), DataFrame({"a": data2}) - result = df1.combine_first(df2) - expected = DataFrame({"a": data_expected}) - tm.assert_frame_equal(result, expected) + # GH 3593, converting datetime64[ns] incorrectly + df0 = DataFrame( + {"a": [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)]} + ) + df1 = DataFrame({"a": [None, None, None]}) + df2 = df1.combine_first(df0) + tm.assert_frame_equal(df2, df0) + + df2 = df0.combine_first(df1) + tm.assert_frame_equal(df2, df0) + + df0 = DataFrame( + {"a": [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)]} + ) + df1 = DataFrame({"a": [datetime(2000, 1, 2), None, None]}) + df2 = df1.combine_first(df0) + result = df0.copy() + result.iloc[0, :] = df1.iloc[0, :] + tm.assert_frame_equal(df2, result) + + df2 = df0.combine_first(df1) + tm.assert_frame_equal(df2, df0) def test_combine_first_align_nan(self): # GH 7509 (not fixed) @@ -359,14 +339,9 @@ def test_combine_first_int(self): df1 = DataFrame({"a": [0, 1, 3, 5]}, dtype="int64") df2 = DataFrame({"a": [1, 4]}, dtype="int64") - result_12 = df1.combine_first(df2) - expected_12 = DataFrame({"a": [0, 1, 3, 5]}, dtype="float64") - tm.assert_frame_equal(result_12, expected_12) - - result_21 = df2.combine_first(df1) - expected_21 = DataFrame({"a": [1, 4, 3, 5]}, dtype="float64") - - tm.assert_frame_equal(result_21, expected_21) + res = df1.combine_first(df2) + tm.assert_frame_equal(res, df1) + assert res["a"].dtype == "int64" @pytest.mark.parametrize("val", [1, 1.0]) def test_combine_first_with_asymmetric_other(self, val): @@ -392,26 +367,6 @@ def test_combine_first_string_dtype_only_na(self): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize( - "scalar1, scalar2", - [ - (datetime(2020, 1, 1), datetime(2020, 1, 2)), - (pd.Period("2020-01-01", "D"), pd.Period("2020-01-02", "D")), - (pd.Timedelta("89 days"), pd.Timedelta("60 min")), - (pd.Interval(left=0, right=1), pd.Interval(left=2, right=3, closed="left")), - ], -) -def test_combine_first_timestamp_bug(scalar1, scalar2, nulls_fixture): - # GH28481 - 
na_value = nulls_fixture - frame = DataFrame([[na_value, na_value]], columns=["a", "b"]) - other = DataFrame([[scalar1, scalar2]], columns=["b", "c"]) - - result = frame.combine_first(other) - expected = DataFrame([[na_value, scalar1, scalar2]], columns=["a", "b", "c"]) - tm.assert_frame_equal(result, expected) - - def test_combine_first_with_nan_multiindex(): # gh-36562 diff --git a/pandas/tests/frame/methods/test_convert.py b/pandas/tests/frame/methods/test_convert.py index a00b2b5960884..50add248f9614 100644 --- a/pandas/tests/frame/methods/test_convert.py +++ b/pandas/tests/frame/methods/test_convert.py @@ -43,9 +43,9 @@ def test_convert_objects(self, float_string_frame): converted["H"].astype("int32") # mixed in a single column - df = DataFrame({"s": Series([1, "na", 3, 4])}) + df = DataFrame(dict(s=Series([1, "na", 3, 4]))) result = df._convert(datetime=True, numeric=True) - expected = DataFrame({"s": Series([1, np.nan, 3, 4])}) + expected = DataFrame(dict(s=Series([1, np.nan, 3, 4]))) tm.assert_frame_equal(result, expected) def test_convert_objects_no_conversion(self): diff --git a/pandas/tests/frame/methods/test_diff.py b/pandas/tests/frame/methods/test_diff.py index b8328b43a6b13..8affcce478cf4 100644 --- a/pandas/tests/frame/methods/test_diff.py +++ b/pandas/tests/frame/methods/test_diff.py @@ -132,10 +132,10 @@ def test_diff_datetime_axis1(self, tz): def test_diff_timedelta(self): # GH#4533 df = DataFrame( - { - "time": [Timestamp("20130101 9:01"), Timestamp("20130101 9:02")], - "value": [1.0, 2.0], - } + dict( + time=[Timestamp("20130101 9:01"), Timestamp("20130101 9:02")], + value=[1.0, 2.0], + ) ) res = df.diff() diff --git a/pandas/tests/frame/methods/test_drop_duplicates.py b/pandas/tests/frame/methods/test_drop_duplicates.py index b1d3890540bf9..79b152b677dfd 100644 --- a/pandas/tests/frame/methods/test_drop_duplicates.py +++ b/pandas/tests/frame/methods/test_drop_duplicates.py @@ -459,12 +459,3 @@ def test_drop_duplicates_series_vs_dataframe(keep): dropped_frame = df[[column]].drop_duplicates(keep=keep) dropped_series = df[column].drop_duplicates(keep=keep) tm.assert_frame_equal(dropped_frame, dropped_series.to_frame()) - - -@pytest.mark.parametrize("arg", [[1], 1, "True", [], 0]) -def test_drop_duplicates_non_boolean_ignore_index(arg): - # GH#38274 - df = DataFrame({"a": [1, 2, 1, 3]}) - msg = '^For argument "ignore_index" expected type bool, received type .*.$' - with pytest.raises(ValueError, match=msg): - df.drop_duplicates(ignore_index=arg) diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index b427611099be3..d59b70fa91a57 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -53,10 +53,10 @@ def test_fillna_mixed_float(self, mixed_float_frame): mf = mixed_float_frame.reindex(columns=["A", "B", "D"]) mf.loc[mf.index[-10:], "A"] = np.nan result = mf.fillna(value=0) - _check_mixed_float(result, dtype={"C": None}) + _check_mixed_float(result, dtype=dict(C=None)) result = mf.fillna(method="pad") - _check_mixed_float(result, dtype={"C": None}) + _check_mixed_float(result, dtype=dict(C=None)) def test_fillna_empty(self): # empty frame (GH#2778) @@ -262,7 +262,7 @@ def test_fillna_dtype_conversion(self): tm.assert_frame_equal(result, expected) # equiv of replace - df = DataFrame({"A": [1, np.nan], "B": [1.0, 2.0]}) + df = DataFrame(dict(A=[1, np.nan], B=[1.0, 2.0])) for v in ["", 1, np.nan, 1.0]: expected = df.replace(np.nan, v) result = df.fillna(v) diff --git 
a/pandas/tests/frame/methods/test_rename.py b/pandas/tests/frame/methods/test_rename.py index 1080d97b30987..857dd0ad7268b 100644 --- a/pandas/tests/frame/methods/test_rename.py +++ b/pandas/tests/frame/methods/test_rename.py @@ -76,8 +76,8 @@ def test_rename(self, float_frame): @pytest.mark.parametrize( "args,kwargs", [ - ((ChainMap({"A": "a"}, {"B": "b"}),), {"axis": "columns"}), - ((), {"columns": ChainMap({"A": "a"}, {"B": "b"})}), + ((ChainMap({"A": "a"}, {"B": "b"}),), dict(axis="columns")), + ((), dict(columns=ChainMap({"A": "a"}, {"B": "b"}))), ], ) def test_rename_chainmap(self, args, kwargs): diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index ab750bca7e069..8e59dd959ab57 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -1123,7 +1123,7 @@ def test_replace_series_no_regex(self): tm.assert_series_equal(result, expected) def test_replace_dict_tuple_list_ordering_remains_the_same(self): - df = DataFrame({"A": [np.nan, 1]}) + df = DataFrame(dict(A=[np.nan, 1])) res1 = df.replace(to_replace={np.nan: 0, 1: -1e8}) res2 = df.replace(to_replace=(1, np.nan), value=[-1e8, 0]) res3 = df.replace(to_replace=[1, np.nan], value=[-1e8, 0]) diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py index 00d4a4277a42f..5864b547a552b 100644 --- a/pandas/tests/frame/methods/test_reset_index.py +++ b/pandas/tests/frame/methods/test_reset_index.py @@ -618,7 +618,7 @@ def test_reset_index_empty_frame_with_datetime64_multiindex(): def test_reset_index_empty_frame_with_datetime64_multiindex_from_groupby(): # https://github.com/pandas-dev/pandas/issues/35657 - df = DataFrame({"c1": [10.0], "c2": ["a"], "c3": pd.to_datetime("2020-01-01")}) + df = DataFrame(dict(c1=[10.0], c2=["a"], c3=pd.to_datetime("2020-01-01"))) df = df.head(0).groupby(["c2", "c3"])[["c1"]].sum() result = df.reset_index() expected = DataFrame( diff --git a/pandas/tests/frame/methods/test_sort_values.py b/pandas/tests/frame/methods/test_sort_values.py index b94f54a4819c0..be5f3ee9c8191 100644 --- a/pandas/tests/frame/methods/test_sort_values.py +++ b/pandas/tests/frame/methods/test_sort_values.py @@ -305,11 +305,11 @@ def test_sort_values_nat_values_in_int_column(self): float_values = (2.0, -1.797693e308) df = DataFrame( - {"int": int_values, "float": float_values}, columns=["int", "float"] + dict(int=int_values, float=float_values), columns=["int", "float"] ) df_reversed = DataFrame( - {"int": int_values[::-1], "float": float_values[::-1]}, + dict(int=int_values[::-1], float=float_values[::-1]), columns=["int", "float"], index=[1, 0], ) @@ -329,12 +329,12 @@ def test_sort_values_nat_values_in_int_column(self): # and now check if NaT is still considered as "na" for datetime64 # columns: df = DataFrame( - {"datetime": [Timestamp("2016-01-01"), NaT], "float": float_values}, + dict(datetime=[Timestamp("2016-01-01"), NaT], float=float_values), columns=["datetime", "float"], ) df_reversed = DataFrame( - {"datetime": [NaT, Timestamp("2016-01-01")], "float": float_values[::-1]}, + dict(datetime=[NaT, Timestamp("2016-01-01")], float=float_values[::-1]), columns=["datetime", "float"], index=[1, 0], ) diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 4cf0b1febf0af..7babc6853aef3 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -38,7 +38,7 @@ class TestDataFrameToCSV: 
def read_csv(self, path, **kwargs): - params = {"index_col": 0, "parse_dates": True} + params = dict(index_col=0, parse_dates=True) params.update(**kwargs) return pd.read_csv(path, **params) @@ -248,10 +248,10 @@ def make_dtnat_arr(n, nnat=None): # s3=make_dtnjat_arr(chunksize+5,0) with tm.ensure_clean("1.csv") as pth: - df = DataFrame({"a": s1, "b": s2}) + df = DataFrame(dict(a=s1, b=s2)) df.to_csv(pth, chunksize=chunksize) - recons = self.read_csv(pth).apply(to_datetime) + recons = self.read_csv(pth)._convert(datetime=True, coerce=True) tm.assert_frame_equal(df, recons, check_names=False) @pytest.mark.slow @@ -260,7 +260,7 @@ def _do_test( df, r_dtype=None, c_dtype=None, rnlvl=None, cnlvl=None, dupe_col=False ): - kwargs = {"parse_dates": False} + kwargs = dict(parse_dates=False) if cnlvl: if rnlvl is not None: kwargs["index_col"] = list(range(rnlvl)) @@ -291,7 +291,7 @@ def _to_uni(x): recons.index = ix recons = recons.iloc[:, rnlvl - 1 :] - type_map = {"i": "i", "f": "f", "s": "O", "u": "O", "dt": "O", "p": "O"} + type_map = dict(i="i", f="f", s="O", u="O", dt="O", p="O") if r_dtype: if r_dtype == "u": # unicode r_dtype = "O" @@ -738,7 +738,7 @@ def create_cols(name): df = pd.concat([df_float, df_int, df_bool, df_object, df_dt], axis=1) # dtype - dtypes = {} + dtypes = dict() for n, dtype in [ ("float", np.float64), ("int", np.int64), diff --git a/pandas/tests/frame/methods/test_to_records.py b/pandas/tests/frame/methods/test_to_records.py index 4d40f191a904b..d9c999c9119f4 100644 --- a/pandas/tests/frame/methods/test_to_records.py +++ b/pandas/tests/frame/methods/test_to_records.py @@ -131,7 +131,7 @@ def test_to_records_with_categorical(self): [ # No dtypes --> default to array dtypes. ( - {}, + dict(), np.rec.array( [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")], dtype=[("index", " cast to object via concat_compat - result = ci.append(Index(["a", "d"])) - expected = Index(["a", "a", "b", "b", "c", "a", "a", "d"]) - tm.assert_index_equal(result, expected, exact=True) + # invalid objects + msg = "cannot append a non-category item to a CategoricalIndex" + with pytest.raises(TypeError, match=msg): + ci.append(Index(["a", "d"])) # GH14298 - if base object is not categorical -> coerce to object result = Index(["c", "a"]).append(ci) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index d098e5b639f25..e250d8cf1b326 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -73,10 +73,7 @@ def test_shift(self): # GH8083 test the base class for shift idx = self.create_index() - msg = ( - f"This method is only implemented for DatetimeIndex, PeriodIndex and " - f"TimedeltaIndex; Got type {type(idx).__name__}" - ) + msg = f"Not supported for type {type(idx).__name__}" with pytest.raises(NotImplementedError, match=msg): idx.shift(1) with pytest.raises(NotImplementedError, match=msg): diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index 789510b452969..2657fc817ec3a 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -265,12 +265,10 @@ def test_factorize(self): arr, idx = idx1.factorize() tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) - assert idx.freq == exp_idx.freq arr, idx = idx1.factorize(sort=True) tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) - assert idx.freq == exp_idx.freq # tz must be preserved idx1 = idx1.tz_localize("Asia/Tokyo") @@ -279,7 +277,6 @@ 
def test_factorize(self): arr, idx = idx1.factorize() tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) - assert idx.freq == exp_idx.freq idx2 = DatetimeIndex( ["2014-03", "2014-03", "2014-02", "2014-01", "2014-03", "2014-01"] @@ -290,31 +287,21 @@ def test_factorize(self): arr, idx = idx2.factorize(sort=True) tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) - assert idx.freq == exp_idx.freq exp_arr = np.array([0, 0, 1, 2, 0, 2], dtype=np.intp) exp_idx = DatetimeIndex(["2014-03", "2014-02", "2014-01"]) arr, idx = idx2.factorize() tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) - assert idx.freq == exp_idx.freq - def test_factorize_preserves_freq(self): - # GH#38120 freq should be preserved + # freq must be preserved idx3 = date_range("2000-01", periods=4, freq="M", tz="Asia/Tokyo") exp_arr = np.array([0, 1, 2, 3], dtype=np.intp) - arr, idx = idx3.factorize() tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, idx3) - assert idx.freq == idx3.freq - - arr, idx = pd.factorize(idx3) - tm.assert_numpy_array_equal(arr, exp_arr) - tm.assert_index_equal(idx, idx3) - assert idx.freq == idx3.freq - def test_factorize_tz(self, tz_naive_fixture, index_or_series): + def test_factorize_tz(self, tz_naive_fixture): tz = tz_naive_fixture # GH#13750 base = date_range("2016-11-05", freq="H", periods=100, tz=tz) @@ -322,33 +309,27 @@ def test_factorize_tz(self, tz_naive_fixture, index_or_series): exp_arr = np.arange(100, dtype=np.intp).repeat(5) - obj = index_or_series(idx) - - arr, res = obj.factorize() - tm.assert_numpy_array_equal(arr, exp_arr) - expected = base._with_freq(None) - tm.assert_index_equal(res, expected) - assert res.freq == expected.freq + for obj in [idx, pd.Series(idx)]: + arr, res = obj.factorize() + tm.assert_numpy_array_equal(arr, exp_arr) + expected = base._with_freq(None) + tm.assert_index_equal(res, expected) - def test_factorize_dst(self, index_or_series): + def test_factorize_dst(self): # GH 13750 idx = date_range("2016-11-06", freq="H", periods=12, tz="US/Eastern") - obj = index_or_series(idx) - arr, res = obj.factorize() - tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp)) - tm.assert_index_equal(res, idx) - if index_or_series is Index: - assert res.freq == idx.freq + for obj in [idx, pd.Series(idx)]: + arr, res = obj.factorize() + tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp)) + tm.assert_index_equal(res, idx) idx = date_range("2016-06-13", freq="H", periods=12, tz="US/Eastern") - obj = index_or_series(idx) - arr, res = obj.factorize() - tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp)) - tm.assert_index_equal(res, idx) - if index_or_series is Index: - assert res.freq == idx.freq + for obj in [idx, pd.Series(idx)]: + arr, res = obj.factorize() + tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp)) + tm.assert_index_equal(res, idx) @pytest.mark.parametrize( "arr, expected", diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py index 93772e2c27a82..c8edd30e3f7aa 100644 --- a/pandas/tests/indexes/datetimes/test_setops.py +++ b/pandas/tests/indexes/datetimes/test_setops.py @@ -471,11 +471,10 @@ def test_intersection_bug(self): def test_intersection_list(self): # GH#35876 - # values is not an Index -> no name -> retain "a" values = [pd.Timestamp("2020-01-01"), pd.Timestamp("2020-02-01")] idx = DatetimeIndex(values, name="a") res = idx.intersection(values) - 
tm.assert_index_equal(res, idx) + tm.assert_index_equal(res, idx.rename(None)) def test_month_range_union_tz_pytz(self, sort): from pytz import timezone @@ -510,20 +509,6 @@ def test_month_range_union_tz_dateutil(self, sort): early_dr.union(late_dr, sort=sort) - @pytest.mark.parametrize("sort", [False, None]) - def test_intersection_duplicates(self, sort): - # GH#38196 - idx1 = Index( - [ - pd.Timestamp("2019-12-13"), - pd.Timestamp("2019-12-12"), - pd.Timestamp("2019-12-12"), - ] - ) - result = idx1.intersection(idx1, sort=sort) - expected = Index([pd.Timestamp("2019-12-13"), pd.Timestamp("2019-12-12")]) - tm.assert_index_equal(result, expected) - class TestCustomDatetimeIndex: def setup_method(self, method): diff --git a/pandas/tests/indexes/interval/test_formats.py b/pandas/tests/indexes/interval/test_formats.py index fb59334b2e129..0e8d7d1ba5aba 100644 --- a/pandas/tests/indexes/interval/test_formats.py +++ b/pandas/tests/indexes/interval/test_formats.py @@ -1,15 +1,7 @@ import numpy as np import pytest -from pandas import ( - DataFrame, - Float64Index, - Interval, - IntervalIndex, - Series, - Timedelta, - Timestamp, -) +from pandas import DataFrame, IntervalIndex, Series, Timedelta, Timestamp import pandas._testing as tm @@ -45,25 +37,6 @@ def test_repr_missing(self, constructor, expected): result = repr(obj) assert result == expected - def test_repr_floats(self): - # GH 32553 - - markers = Series( - ["foo", "bar"], - index=IntervalIndex( - [ - Interval(left, right) - for left, right in zip( - Float64Index([329.973, 345.137], dtype="float64"), - Float64Index([345.137, 360.191], dtype="float64"), - ) - ] - ), - ) - result = str(markers) - expected = "(329.973, 345.137] foo\n(345.137, 360.191] bar\ndtype: object" - assert result == expected - @pytest.mark.parametrize( "tuples, closed, expected_data", [ diff --git a/pandas/tests/indexes/interval/test_setops.py b/pandas/tests/indexes/interval/test_setops.py index 0ef833bb93ded..0b94d70367b4d 100644 --- a/pandas/tests/indexes/interval/test_setops.py +++ b/pandas/tests/indexes/interval/test_setops.py @@ -32,17 +32,15 @@ def test_union(self, closed, sort): tm.assert_index_equal(index.union(index, sort=sort), index) tm.assert_index_equal(index.union(index[:1], sort=sort), index) - def test_union_empty_result(self, closed, sort): # GH 19101: empty result, same dtype index = empty_index(dtype="int64", closed=closed) result = index.union(index, sort=sort) tm.assert_index_equal(result, index) - # GH 19101: empty result, different dtypes -> common dtype is object + # GH 19101: empty result, different dtypes other = empty_index(dtype="float64", closed=closed) result = index.union(other, sort=sort) - expected = Index([], dtype=object) - tm.assert_index_equal(result, expected) + tm.assert_index_equal(result, index) def test_intersection(self, closed, sort): index = monotonic_index(0, 11, closed=closed) diff --git a/pandas/tests/indexes/multi/test_analytics.py b/pandas/tests/indexes/multi/test_analytics.py index 25e2f6a3777d1..bd4926880c13d 100644 --- a/pandas/tests/indexes/multi/test_analytics.py +++ b/pandas/tests/indexes/multi/test_analytics.py @@ -11,8 +11,7 @@ def test_shift(idx): # GH8083 test the base class for shift - msg = "This method is only implemented for DatetimeIndex, PeriodIndex and " - "TimedeltaIndex; Got type MultiIndex" + msg = "Not supported for type MultiIndex" with pytest.raises(NotImplementedError, match=msg): idx.shift(1) with pytest.raises(NotImplementedError, match=msg): diff --git a/pandas/tests/indexes/multi/test_drop.py 
b/pandas/tests/indexes/multi/test_drop.py index f7b1bc4729428..c39954b22b0f2 100644 --- a/pandas/tests/indexes/multi/test_drop.py +++ b/pandas/tests/indexes/multi/test_drop.py @@ -1,5 +1,3 @@ -import warnings - import numpy as np import pytest @@ -151,16 +149,6 @@ def test_drop_with_nan_in_index(nulls_fixture): mi.drop(pd.Timestamp("2001"), level="date") -def test_drop_with_non_monotonic_duplicates(): - # GH#33494 - mi = MultiIndex.from_tuples([(1, 2), (2, 3), (1, 2)]) - with warnings.catch_warnings(): - warnings.simplefilter("ignore", PerformanceWarning) - result = mi.drop((1, 2)) - expected = MultiIndex.from_tuples([(2, 3)]) - tm.assert_index_equal(result, expected) - - def test_single_level_drop_partially_missing_elements(): # GH 37820 diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index 51538c556de15..4ac9a27069a3f 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -from pandas import Index, MultiIndex, Series +from pandas import MultiIndex, Series import pandas._testing as tm @@ -294,24 +294,6 @@ def test_intersection(idx, sort): # assert result.equals(tuples) -def test_intersection_non_object(idx, sort): - other = Index(range(3), name="foo") - - result = idx.intersection(other, sort=sort) - expected = MultiIndex(levels=idx.levels, codes=[[]] * idx.nlevels, names=None) - tm.assert_index_equal(result, expected, exact=True) - - # if we pass a length-0 ndarray (i.e. no name, we retain our idx.name) - result = idx.intersection(np.asarray(other)[:0], sort=sort) - expected = MultiIndex(levels=idx.levels, codes=[[]] * idx.nlevels, names=idx.names) - tm.assert_index_equal(result, expected, exact=True) - - msg = "other must be a MultiIndex or a list of tuples" - with pytest.raises(TypeError, match=msg): - # With non-zero length non-index, we try and fail to convert to tuples - idx.intersection(np.asarray(other), sort=sort) - - def test_intersect_equal_sort(): # GH-24959 idx = pd.MultiIndex.from_product([[1, 0], ["a", "b"]]) @@ -396,26 +378,3 @@ def test_setops_disallow_true(method): with pytest.raises(ValueError, match="The 'sort' keyword only takes"): getattr(idx1, method)(idx2, sort=True) - - -@pytest.mark.parametrize( - ("tuples", "exp_tuples"), - [ - ([("val1", "test1")], [("val1", "test1")]), - ([("val1", "test1"), ("val1", "test1")], [("val1", "test1")]), - ( - [("val2", "test2"), ("val1", "test1")], - [("val2", "test2"), ("val1", "test1")], - ), - ], -) -def test_intersect_with_duplicates(tuples, exp_tuples): - # GH#36915 - left = MultiIndex.from_tuples(tuples, names=["first", "second"]) - right = MultiIndex.from_tuples( - [("val1", "test1"), ("val1", "test1"), ("val2", "test2")], - names=["first", "second"], - ) - result = left.intersection(right) - expected = MultiIndex.from_tuples(exp_tuples, names=["first", "second"]) - tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/period/test_indexing.py b/pandas/tests/indexes/period/test_indexing.py index c03c89f32f73e..9b203e1b17517 100644 --- a/pandas/tests/indexes/period/test_indexing.py +++ b/pandas/tests/indexes/period/test_indexing.py @@ -21,28 +21,6 @@ ) import pandas._testing as tm -dti4 = date_range("2016-01-01", periods=4) -dti = dti4[:-1] -rng = pd.Index(range(3)) - - -@pytest.fixture( - params=[ - dti, - dti.tz_localize("UTC"), - dti.to_period("W"), - dti - dti[0], - rng, - pd.Index([1, 2, 3]), - pd.Index([2.0, 3.0, 4.0]), - pd.Index([4, 5, 6], dtype="u8"), 
- pd.IntervalIndex.from_breaks(dti4), - ] -) -def non_comparable_idx(request): - # All have length 3 - return request.param - class TestGetItem: def test_ellipsis(self): @@ -460,37 +438,6 @@ def test_get_indexer_mismatched_dtype(self): result = pi.get_indexer_non_unique(pi2)[0] tm.assert_numpy_array_equal(result, expected) - def test_get_indexer_mismatched_dtype_different_length(self, non_comparable_idx): - # without method we arent checking inequalities, so get all-missing - # but do not raise - dti = date_range("2016-01-01", periods=3) - pi = dti.to_period("D") - - other = non_comparable_idx - - res = pi[:-1].get_indexer(other) - expected = -np.ones(other.shape, dtype=np.intp) - tm.assert_numpy_array_equal(res, expected) - - @pytest.mark.parametrize("method", ["pad", "backfill", "nearest"]) - def test_get_indexer_mismatched_dtype_with_method(self, non_comparable_idx, method): - dti = date_range("2016-01-01", periods=3) - pi = dti.to_period("D") - - other = non_comparable_idx - - msg = re.escape(f"Cannot compare dtypes {pi.dtype} and {other.dtype}") - with pytest.raises(TypeError, match=msg): - pi.get_indexer(other, method=method) - - for dtype in ["object", "category"]: - other2 = other.astype(dtype) - if dtype == "object" and isinstance(other, PeriodIndex): - continue - # For object dtype we are liable to get a different exception message - with pytest.raises(TypeError): - pi.get_indexer(other2, method=method) - def test_get_indexer_non_unique(self): # GH 17717 p1 = Period("2017-09-02") diff --git a/pandas/tests/indexes/period/test_partial_slicing.py b/pandas/tests/indexes/period/test_partial_slicing.py index f354682bf6f70..878a89bd52cb1 100644 --- a/pandas/tests/indexes/period/test_partial_slicing.py +++ b/pandas/tests/indexes/period/test_partial_slicing.py @@ -94,7 +94,7 @@ def test_range_slice_outofbounds(self, make_range): def test_maybe_cast_slice_bound(self, make_range, frame_or_series): idx = make_range(start="2013/10/01", freq="D", periods=10) - obj = DataFrame({"units": [100 + i for i in range(10)]}, index=idx) + obj = DataFrame(dict(units=[100 + i for i in range(10)]), index=idx) if frame_or_series is not DataFrame: obj = obj["units"] diff --git a/pandas/tests/indexes/ranges/test_setops.py b/pandas/tests/indexes/ranges/test_setops.py index 660269f2d02a4..1fd41b017221b 100644 --- a/pandas/tests/indexes/ranges/test_setops.py +++ b/pandas/tests/indexes/ranges/test_setops.py @@ -3,52 +3,11 @@ import numpy as np import pytest -from pandas import Index, Int64Index, RangeIndex, UInt64Index +from pandas import Index, Int64Index, RangeIndex import pandas._testing as tm class TestRangeIndexSetOps: - @pytest.mark.parametrize("klass", [RangeIndex, Int64Index, UInt64Index]) - def test_intersection_mismatched_dtype(self, klass): - # check that we cast to float, not object - index = RangeIndex(start=0, stop=20, step=2, name="foo") - index = klass(index) - - flt = index.astype(np.float64) - - # bc index.equals(flt), we go through fastpath and get RangeIndex back - result = index.intersection(flt) - tm.assert_index_equal(result, index, exact=True) - - result = flt.intersection(index) - tm.assert_index_equal(result, flt, exact=True) - - # neither empty, not-equals - result = index.intersection(flt[1:]) - tm.assert_index_equal(result, flt[1:], exact=True) - - result = flt[1:].intersection(index) - tm.assert_index_equal(result, flt[1:], exact=True) - - # empty other - result = index.intersection(flt[:0]) - tm.assert_index_equal(result, flt[:0], exact=True) - - result = flt[:0].intersection(index) 
- tm.assert_index_equal(result, flt[:0], exact=True) - - def test_intersection_empty(self, sort, names): - # name retention on empty intersections - index = RangeIndex(start=0, stop=20, step=2, name=names[0]) - - # empty other - result = index.intersection(index[:0].rename(names[1]), sort=sort) - tm.assert_index_equal(result, index[:0].rename(names[2]), exact=True) - - # empty self - result = index[:0].intersection(index.rename(names[1]), sort=sort) - tm.assert_index_equal(result, index[:0].rename(names[2]), exact=True) - def test_intersection(self, sort): # intersect with Int64Index index = RangeIndex(start=0, stop=20, step=2) @@ -90,12 +49,12 @@ def test_intersection(self, sort): result = other.intersection(first, sort=sort).astype(int) tm.assert_index_equal(result, expected) - index = RangeIndex(5, name="foo") + index = RangeIndex(5) # intersect of non-overlapping indices - other = RangeIndex(5, 10, 1, name="foo") + other = RangeIndex(5, 10, 1) result = index.intersection(other, sort=sort) - expected = RangeIndex(0, 0, 1, name="foo") + expected = RangeIndex(0, 0, 1) tm.assert_index_equal(result, expected) other = RangeIndex(-1, -5, -1) @@ -112,12 +71,11 @@ def test_intersection(self, sort): result = other.intersection(index, sort=sort) tm.assert_index_equal(result, expected) - def test_intersection_non_overlapping_gcd(self, sort, names): # intersection of non-overlapping values based on start value and gcd - index = RangeIndex(1, 10, 2, name=names[0]) - other = RangeIndex(0, 10, 4, name=names[1]) + index = RangeIndex(1, 10, 2) + other = RangeIndex(0, 10, 4) result = index.intersection(other, sort=sort) - expected = RangeIndex(0, 0, 1, name=names[2]) + expected = RangeIndex(0, 0, 1) tm.assert_index_equal(result, expected) def test_union_noncomparable(self, sort): diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 372a1d290bca0..ba49c51c9db8e 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -9,7 +9,6 @@ import pytest from pandas._libs.tslib import Timestamp -from pandas.compat import IS64 from pandas.compat.numpy import np_datetime64_compat from pandas.util._test_decorators import async_mark @@ -20,7 +19,6 @@ DatetimeIndex, Float64Index, Int64Index, - IntervalIndex, PeriodIndex, RangeIndex, Series, @@ -1251,9 +1249,10 @@ def test_get_indexer_numeric_index_boolean_target(self, method, idx_class): if method == "get_indexer": tm.assert_numpy_array_equal(result, expected) else: - missing = np.arange(3, dtype=np.intp) + expected = np.array([-1, -1, -1, -1], dtype=np.intp) + tm.assert_numpy_array_equal(result[0], expected) - tm.assert_numpy_array_equal(result[1], missing) + tm.assert_numpy_array_equal(result[1], expected) def test_get_indexer_with_NA_values( self, unique_nulls_fixture, unique_nulls_fixture2 @@ -1506,17 +1505,6 @@ def test_drop_tuple(self, values, to_drop): with pytest.raises(KeyError, match=msg): removed.drop(drop_me) - def test_drop_with_duplicates_in_index(self, index): - # GH38051 - if len(index) == 0 or isinstance(index, MultiIndex): - return - if isinstance(index, IntervalIndex) and not IS64: - pytest.skip("Cannot test IntervalIndex with int64 dtype on 32 bit platform") - index = index.unique().repeat(2) - expected = index[2:] - result = index.drop(index[0]) - tm.assert_index_equal(result, expected) - @pytest.mark.parametrize( "attr", [ @@ -2358,6 +2346,5 @@ def construct(dtype): else: no_matches = np.array([-1] * 6, dtype=np.intp) - missing = np.arange(6, dtype=np.intp) 
tm.assert_numpy_array_equal(result[0], no_matches) - tm.assert_numpy_array_equal(result[1], missing) + tm.assert_numpy_array_equal(result[1], no_matches) diff --git a/pandas/tests/indexes/test_numpy_compat.py b/pandas/tests/indexes/test_numpy_compat.py index 8bfb97ca494e6..b71417b2a625d 100644 --- a/pandas/tests/indexes/test_numpy_compat.py +++ b/pandas/tests/indexes/test_numpy_compat.py @@ -74,16 +74,14 @@ def test_numpy_ufuncs_basic(index, func): @pytest.mark.parametrize( "func", [np.isfinite, np.isinf, np.isnan, np.signbit], ids=lambda x: x.__name__ ) -def test_numpy_ufuncs_other(index, func, request): +def test_numpy_ufuncs_other(index, func): # test ufuncs of numpy, see: # https://numpy.org/doc/stable/reference/ufuncs.html if isinstance(index, (DatetimeIndex, TimedeltaIndex)): if isinstance(index, DatetimeIndex) and index.tz is not None: if func in [np.isfinite, np.isnan, np.isinf]: - if not np_version_under1p17: - mark = pytest.mark.xfail(reason="__array_ufunc__ is not defined") - request.node.add_marker(mark) + pytest.xfail(reason="__array_ufunc__ is not defined") if not np_version_under1p18 and func in [np.isfinite, np.isinf, np.isnan]: # numpy 1.18(dev) changed isinf and isnan to not raise on dt64/tfd64 diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 6f949960ce30b..0973cef7cfdc1 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -98,20 +98,13 @@ def test_compatible_inconsistent_pairs(idx_fact1, idx_fact2): ("Period[D]", "float64", "object"), ], ) -@pytest.mark.parametrize("names", [("foo", "foo", "foo"), ("foo", "bar", None)]) -def test_union_dtypes(left, right, expected, names): +def test_union_dtypes(left, right, expected): left = pandas_dtype(left) right = pandas_dtype(right) - a = pd.Index([], dtype=left, name=names[0]) - b = pd.Index([], dtype=right, name=names[1]) - result = a.union(b) - assert result.dtype == expected - assert result.name == names[2] - - # Testing name retention - # TODO: pin down desired dtype; do we want it to be commutative? 
- result = a.intersection(b) - assert result.name == names[2] + a = pd.Index([], dtype=left) + b = pd.Index([], dtype=right) + result = a.union(b).dtype + assert result == expected def test_dunder_inplace_setops_deprecated(index): @@ -127,16 +120,6 @@ def test_dunder_inplace_setops_deprecated(index): index ^= index -@pytest.mark.parametrize("values", [[1, 2, 2, 3], [3, 3]]) -def test_intersection_duplicates(values): - # GH#31326 - a = pd.Index(values) - b = pd.Index([3, 3]) - result = a.intersection(b) - expected = pd.Index([3]) - tm.assert_index_equal(result, expected) - - class TestSetOps: # Set operation tests shared by all indexes in the `index` fixture @pytest.mark.parametrize("case", [0.5, "xxx"]) @@ -395,25 +378,6 @@ def test_intersect_unequal(self, index, fname, sname, expected_name): expected = index[1:].set_names(expected_name).sort_values() tm.assert_index_equal(intersect, expected) - def test_intersection_name_retention_with_nameless(self, index): - if isinstance(index, MultiIndex): - index = index.rename(list(range(index.nlevels))) - else: - index = index.rename("foo") - - other = np.asarray(index) - - result = index.intersection(other) - assert result.name == index.name - - # empty other, same dtype - result = index.intersection(other[:0]) - assert result.name == index.name - - # empty `self` - result = index[:0].intersection(other) - assert result.name == index.name - def test_difference_preserves_type_empty(self, index, sort): # GH#20040 # If taking difference of a set and itself, it @@ -424,18 +388,6 @@ def test_difference_preserves_type_empty(self, index, sort): expected = index[:0] tm.assert_index_equal(result, expected, exact=True) - def test_difference_name_retention_equals(self, index, sort, names): - if isinstance(index, MultiIndex): - names = [[x] * index.nlevels for x in names] - index = index.rename(names[0]) - other = index.rename(names[1]) - - assert index.equals(other) - - result = index.difference(other) - expected = index[:0].rename(names[2]) - tm.assert_index_equal(result, expected) - def test_intersection_difference_match_empty(self, index, sort): # GH#20040 # Test that the intersection of an index with an diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index f0e730eecf3d5..774370ed866da 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -75,26 +75,17 @@ def test_factorize(self): arr, idx = idx1.factorize() tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) - assert idx.freq == exp_idx.freq arr, idx = idx1.factorize(sort=True) tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) - assert idx.freq == exp_idx.freq - def test_factorize_preserves_freq(self): - # GH#38120 freq should be preserved + # freq must be preserved idx3 = timedelta_range("1 day", periods=4, freq="s") exp_arr = np.array([0, 1, 2, 3], dtype=np.intp) arr, idx = idx3.factorize() tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, idx3) - assert idx.freq == idx3.freq - - arr, idx = pd.factorize(idx3) - tm.assert_numpy_array_equal(arr, exp_arr) - tm.assert_index_equal(idx, idx3) - assert idx.freq == idx3.freq def test_sort_values(self): diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py index 9a3039c28416c..a3b8d66c92024 100644 --- a/pandas/tests/indexing/multiindex/test_multiindex.py +++ 
b/pandas/tests/indexing/multiindex/test_multiindex.py @@ -83,13 +83,3 @@ def test_nested_tuples_duplicates(self): df3 = df.copy(deep=True) df3.loc[[(dti[0], "a")], "c2"] = 1.0 tm.assert_frame_equal(df3, expected) - - def test_multiindex_with_datatime_level_preserves_freq(self): - # https://github.com/pandas-dev/pandas/issues/35563 - idx = Index(range(2), name="A") - dti = pd.date_range("2020-01-01", periods=7, freq="D", name="B") - mi = MultiIndex.from_product([idx, dti]) - df = DataFrame(np.random.randn(14, 2), index=mi) - result = df.loc[0].index - tm.assert_index_equal(result, dti) - assert result.freq == dti.freq diff --git a/pandas/tests/indexing/multiindex/test_slice.py b/pandas/tests/indexing/multiindex/test_slice.py index 51684f092aefd..d58bc4713f99f 100644 --- a/pandas/tests/indexing/multiindex/test_slice.py +++ b/pandas/tests/indexing/multiindex/test_slice.py @@ -779,13 +779,3 @@ def test_non_reducing_slice_on_multiindex(self): result = df.loc[tslice_] expected = DataFrame({("b", "d"): [4, 1]}) tm.assert_frame_equal(result, expected) - - def test_loc_slice_negative_stepsize(self): - # GH#38071 - mi = MultiIndex.from_product([["a", "b"], [0, 1]]) - df = DataFrame([[1, 2], [3, 4], [5, 6], [7, 8]], index=mi) - result = df.loc[("a", slice(None, None, -1)), :] - expected = DataFrame( - [[3, 4], [1, 2]], index=MultiIndex.from_tuples([("a", 1), ("a", 0)]) - ) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 1b9b6452b2e33..6fff706e27cd2 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -57,12 +57,9 @@ def test_loc_scalar(self): with pytest.raises(KeyError, match=r"^'d'$"): df.loc["d"] - df2 = df.copy() - expected = df2.copy() - expected.index = expected.index.astype(object) - expected.loc["d"] = 10 - df2.loc["d"] = 10 - tm.assert_frame_equal(df2, expected) + msg = "cannot append a non-category item to a CategoricalIndex" + with pytest.raises(TypeError, match=msg): + df.loc["d"] = 10 msg = "'fill_value=d' is not present in this Categorical's categories" with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/indexing/test_indexers.py b/pandas/tests/indexing/test_indexers.py index 14b2b494d65fb..744f9441e7376 100644 --- a/pandas/tests/indexing/test_indexers.py +++ b/pandas/tests/indexing/test_indexers.py @@ -28,12 +28,6 @@ def test_is_scalar_indexer(): assert not is_scalar_indexer(slice(None), 1) - indexer = 0 - assert is_scalar_indexer(indexer, 1) - - indexer = (0,) - assert is_scalar_indexer(indexer, 1) - class TestValidateIndices: def test_validate_indices_ok(self): diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 68f12a939e061..cf6c2878acd9a 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1782,23 +1782,21 @@ def test_series_getitem_label_list_missing_integer_values(): @pytest.mark.parametrize( - "columns, column_key, expected_columns", + "columns, column_key, expected_columns, check_column_type", [ - ([2011, 2012, 2013], [2011, 2012], [0, 1]), - ([2011, 2012, "All"], [2011, 2012], [0, 1]), - ([2011, 2012, "All"], [2011, "All"], [0, 2]), + ([2011, 2012, 2013], [2011, 2012], [0, 1], True), + ([2011, 2012, "All"], [2011, 2012], [0, 1], False), + ([2011, 2012, "All"], [2011, "All"], [0, 2], True), ], ) -def test_loc_getitem_label_list_integer_labels(columns, column_key, expected_columns): +def 
test_loc_getitem_label_list_integer_labels( + columns, column_key, expected_columns, check_column_type +): # gh-14836 df = DataFrame(np.random.rand(3, 3), columns=columns, index=list("ABC")) expected = df.iloc[:, expected_columns] result = df.loc[["A", "B", "C"], column_key] - - if df.columns.is_object() and all(isinstance(x, int) for x in column_key): - expected.columns = expected.columns.astype(int) - - tm.assert_frame_equal(result, expected, check_column_type=True) + tm.assert_frame_equal(result, expected, check_column_type=check_column_type) def test_loc_setitem_float_intindex(): @@ -2072,21 +2070,3 @@ def test_loc_setitem_dt64tz_values(self): s2["a"] = expected result = s2["a"] assert result == expected - - @pytest.mark.parametrize("array_fn", [np.array, pd.array, list, tuple]) - @pytest.mark.parametrize("size", [0, 4, 5, 6]) - def test_loc_iloc_setitem_with_listlike(self, size, array_fn): - # GH37748 - # testing insertion, in a Series of size N (here 5), of a listlike object - # of size 0, N-1, N, N+1 - - arr = array_fn([0] * size) - expected = Series([arr, 0, 0, 0, 0], index=list("abcde"), dtype=object) - - ser = Series(0, index=list("abcde"), dtype=object) - ser.loc["a"] = arr - tm.assert_series_equal(ser, expected) - - ser = Series(0, index=list("abcde"), dtype=object) - ser.iloc[0] = arr - tm.assert_series_equal(ser, expected) diff --git a/pandas/tests/indexing/test_scalar.py b/pandas/tests/indexing/test_scalar.py index ce48fd1e5c905..dd01f4e6a4f49 100644 --- a/pandas/tests/indexing/test_scalar.py +++ b/pandas/tests/indexing/test_scalar.py @@ -268,41 +268,35 @@ def test_at_with_tuple_index_set(): assert series.at[1, 2] == 3 -class TestMultiIndexScalar: - def test_multiindex_at_get(self): - # GH 26989 - # DataFrame.at and DataFrame.loc getter works with MultiIndex - df = DataFrame({"a": [1, 2]}, index=[[1, 2], [3, 4]]) - assert df.index.nlevels == 2 - assert df.at[(1, 3), "a"] == 1 - assert df.loc[(1, 3), "a"] == 1 - - # Series.at and Series.loc getter works with MultiIndex - series = df["a"] - assert series.index.nlevels == 2 - assert series.at[1, 3] == 1 - assert series.loc[1, 3] == 1 - - def test_multiindex_at_set(self): - # GH 26989 - # DataFrame.at and DataFrame.loc setter works with MultiIndex - df = DataFrame({"a": [1, 2]}, index=[[1, 2], [3, 4]]) - assert df.index.nlevels == 2 - df.at[(1, 3), "a"] = 3 - assert df.at[(1, 3), "a"] == 3 - df.loc[(1, 3), "a"] = 4 - assert df.loc[(1, 3), "a"] == 4 - - # Series.at and Series.loc setter works with MultiIndex - series = df["a"] - assert series.index.nlevels == 2 - series.at[1, 3] = 5 - assert series.at[1, 3] == 5 - series.loc[1, 3] = 6 - assert series.loc[1, 3] == 6 - - def test_multiindex_at_get_one_level(self): - # GH#38053 - s2 = Series((0, 1), index=[[False, True]]) - result = s2.at[False] - assert result == 0 +def test_multiindex_at_get(): + # GH 26989 + # DataFrame.at and DataFrame.loc getter works with MultiIndex + df = DataFrame({"a": [1, 2]}, index=[[1, 2], [3, 4]]) + assert df.index.nlevels == 2 + assert df.at[(1, 3), "a"] == 1 + assert df.loc[(1, 3), "a"] == 1 + + # Series.at and Series.loc getter works with MultiIndex + series = df["a"] + assert series.index.nlevels == 2 + assert series.at[1, 3] == 1 + assert series.loc[1, 3] == 1 + + +def test_multiindex_at_set(): + # GH 26989 + # DataFrame.at and DataFrame.loc setter works with MultiIndex + df = DataFrame({"a": [1, 2]}, index=[[1, 2], [3, 4]]) + assert df.index.nlevels == 2 + df.at[(1, 3), "a"] = 3 + assert df.at[(1, 3), "a"] == 3 + df.loc[(1, 3), "a"] = 4 + assert 
df.loc[(1, 3), "a"] == 4 + + # Series.at and Series.loc setter works with MultiIndex + series = df["a"] + assert series.index.nlevels == 2 + series.at[1, 3] = 5 + assert series.at[1, 3] == 5 + series.loc[1, 3] = 6 + assert series.loc[1, 3] == 6 diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index bcc666a88e3be..e9f228b5973b5 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -1,4 +1,3 @@ -import logging import os import shlex import subprocess @@ -50,8 +49,6 @@ def s3_base(worker_id): pytest.importorskip("s3fs") pytest.importorskip("boto3") requests = pytest.importorskip("requests") - # GH 38090: Suppress http logs in tests by moto_server - logging.getLogger("werkzeug").disabled = True with tm.ensure_safe_environment_variables(): # temporary workaround as moto fails for botocore >= 1.11 otherwise, diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py index 3155e22d3ff5d..1349808277d81 100644 --- a/pandas/tests/io/excel/test_openpyxl.py +++ b/pandas/tests/io/excel/test_openpyxl.py @@ -68,11 +68,11 @@ def test_write_cells_merge_styled(ext): ] with tm.ensure_clean(ext) as path: - with _OpenpyxlWriter(path) as writer: - writer.write_cells(initial_cells, sheet_name=sheet_name) - writer.write_cells(merge_cells, sheet_name=sheet_name) + writer = _OpenpyxlWriter(path) + writer.write_cells(initial_cells, sheet_name=sheet_name) + writer.write_cells(merge_cells, sheet_name=sheet_name) - wks = writer.sheets[sheet_name] + wks = writer.sheets[sheet_name] xcell_b1 = wks["B1"] xcell_a2 = wks["A2"] assert xcell_b1.font == openpyxl_sty_merged @@ -93,8 +93,9 @@ def test_write_append_mode(ext, mode, expected): wb.worksheets[1]["A1"].value = "bar" wb.save(f) - with ExcelWriter(f, engine="openpyxl", mode=mode) as writer: - df.to_excel(writer, sheet_name="baz", index=False) + writer = ExcelWriter(f, engine="openpyxl", mode=mode) + df.to_excel(writer, sheet_name="baz", index=False) + writer.save() wb2 = openpyxl.load_workbook(f) result = [sheet.title for sheet in wb2.worksheets] diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 98a55ae39bd77..c582a0fa23577 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -577,10 +577,6 @@ def test_date_conversion_overflow(self, read_ext): if pd.read_excel.keywords["engine"] == "openpyxl": pytest.xfail("Maybe not supported by openpyxl") - if pd.read_excel.keywords["engine"] is None: - # GH 35029 - pytest.xfail("Defaults to openpyxl, maybe not supported") - result = pd.read_excel("testdateoverflow" + read_ext) tm.assert_frame_equal(result, expected) @@ -1163,7 +1159,7 @@ def test_excel_high_surrogate(self, engine): expected = DataFrame(["\udc88"], columns=["Column1"]) # should not produce a segmentation violation - actual = pd.read_excel("high_surrogate.xlsx", engine="xlrd") + actual = pd.read_excel("high_surrogate.xlsx") tm.assert_frame_equal(expected, actual) @pytest.mark.parametrize("filename", ["df_empty.xlsx", "df_equals.xlsx"]) diff --git a/pandas/tests/io/excel/test_style.py b/pandas/tests/io/excel/test_style.py index 6b1abebe0506a..936fc175a493b 100644 --- a/pandas/tests/io/excel/test_style.py +++ b/pandas/tests/io/excel/test_style.py @@ -68,14 +68,15 @@ def custom_converter(css): df = DataFrame(np.random.randn(11, 3)) with tm.ensure_clean(".xlsx" if engine != "xlwt" else ".xls") as path: - with ExcelWriter(path, engine=engine) as writer: - df.to_excel(writer, sheet_name="frame") - 
df.style.to_excel(writer, sheet_name="unstyled") - styled = df.style.apply(style, axis=None) - styled.to_excel(writer, sheet_name="styled") - ExcelFormatter(styled, style_converter=custom_converter).write( - writer, sheet_name="custom" - ) + writer = ExcelWriter(path, engine=engine) + df.to_excel(writer, sheet_name="frame") + df.style.to_excel(writer, sheet_name="unstyled") + styled = df.style.apply(style, axis=None) + styled.to_excel(writer, sheet_name="styled") + ExcelFormatter(styled, style_converter=custom_converter).write( + writer, sheet_name="custom" + ) + writer.save() if engine not in ("openpyxl", "xlsxwriter"): # For other engines, we only smoke test diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 80ebeb4c03d89..8da9c79160e91 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -351,15 +351,12 @@ def test_excel_sheet_by_name_raise(self, path, engine): msg = "sheet 0 not found" with pytest.raises(ValueError, match=msg): pd.read_excel(xl, "0") - elif engine == "xlwt": + else: import xlrd msg = "No sheet named <'0'>" with pytest.raises(xlrd.XLRDError, match=msg): pd.read_excel(xl, sheet_name="0") - else: - with pytest.raises(KeyError, match="Worksheet 0 does not exist."): - pd.read_excel(xl, sheet_name="0") def test_excel_writer_context_manager(self, frame, path): with ExcelWriter(path) as writer: @@ -472,12 +469,12 @@ def test_int_types(self, np_type, path): # Test with convert_float=False comes back as float. float_frame = df.astype(float) - float_frame.columns = float_frame.columns.astype(float) - float_frame.index = float_frame.index.astype(float) recons = pd.read_excel( path, sheet_name="test1", convert_float=False, index_col=0 ) - tm.assert_frame_equal(recons, float_frame) + tm.assert_frame_equal( + recons, float_frame, check_index_type=False, check_column_type=False + ) @pytest.mark.parametrize("np_type", [np.float16, np.float32, np.float64]) def test_float_types(self, np_type, path): @@ -525,9 +522,10 @@ def test_sheets(self, frame, tsframe, path): frame.to_excel(path, "test1", index=False) # Test writing to separate sheets - with ExcelWriter(path) as writer: - frame.to_excel(writer, "test1") - tsframe.to_excel(writer, "test2") + writer = ExcelWriter(path) + frame.to_excel(writer, "test1") + tsframe.to_excel(writer, "test2") + writer.close() reader = ExcelFile(path) recons = pd.read_excel(reader, sheet_name="test1", index_col=0) tm.assert_frame_equal(frame, recons) @@ -1195,24 +1193,23 @@ def test_datetimes(self, path): write_frame = DataFrame({"A": datetimes}) write_frame.to_excel(path, "Sheet1") - # GH 35029 - Default changed to openpyxl, but test is for odf/xlrd - engine = "odf" if path.endswith("ods") else "xlrd" - read_frame = pd.read_excel(path, sheet_name="Sheet1", header=0, engine=engine) + read_frame = pd.read_excel(path, sheet_name="Sheet1", header=0) tm.assert_series_equal(write_frame["A"], read_frame["A"]) def test_bytes_io(self, engine): # see gh-7074 - with BytesIO() as bio: - df = DataFrame(np.random.randn(10, 2)) + bio = BytesIO() + df = DataFrame(np.random.randn(10, 2)) - # Pass engine explicitly, as there is no file path to infer from. - with ExcelWriter(bio, engine=engine) as writer: - df.to_excel(writer) + # Pass engine explicitly, as there is no file path to infer from. 
+ writer = ExcelWriter(bio, engine=engine) + df.to_excel(writer) + writer.save() - bio.seek(0) - reread_df = pd.read_excel(bio, index_col=0) - tm.assert_frame_equal(df, reread_df) + bio.seek(0) + reread_df = pd.read_excel(bio, index_col=0) + tm.assert_frame_equal(df, reread_df) def test_write_lists_dict(self, path): # see gh-8188. @@ -1320,12 +1317,12 @@ class TestExcelWriterEngineTests: ) def test_ExcelWriter_dispatch(self, klass, ext): with tm.ensure_clean(ext) as path: - with ExcelWriter(path) as writer: - if ext == ".xlsx" and td.safe_import("xlsxwriter"): - # xlsxwriter has preference over openpyxl if both installed - assert isinstance(writer, _XlsxWriter) - else: - assert isinstance(writer, klass) + writer = ExcelWriter(path) + if ext == ".xlsx" and td.safe_import("xlsxwriter"): + # xlsxwriter has preference over openpyxl if both installed + assert isinstance(writer, _XlsxWriter) + else: + assert isinstance(writer, klass) def test_ExcelWriter_dispatch_raises(self): with pytest.raises(ValueError, match="No engine"): @@ -1359,8 +1356,8 @@ def check_called(func): path = "something.xlsx" with tm.ensure_clean(path) as filepath: register_writer(DummyClass) - with ExcelWriter(filepath) as writer: - assert isinstance(writer, DummyClass) + writer = ExcelWriter(filepath) + assert isinstance(writer, DummyClass) df = tm.makeCustomDataframe(1, 1) check_called(lambda: df.to_excel(filepath)) with tm.ensure_clean("something.xls") as filepath: @@ -1380,5 +1377,5 @@ def test_excelfile_fspath(self): def test_excelwriter_fspath(self): with tm.ensure_clean("foo.xlsx") as path: - with ExcelWriter(path) as writer: - assert os.fspath(writer) == str(path) + writer = ExcelWriter(path) + assert os.fspath(writer) == str(path) diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py index f2fbcbc2e2f04..26190edaa4960 100644 --- a/pandas/tests/io/excel/test_xlrd.py +++ b/pandas/tests/io/excel/test_xlrd.py @@ -1,7 +1,5 @@ import pytest -from pandas.compat._optional import import_optional_dependency - import pandas as pd import pandas._testing as tm @@ -40,48 +38,6 @@ def test_read_xlrd_book(read_ext, frame): # TODO: test for openpyxl as well def test_excel_table_sheet_by_index(datapath, read_ext): path = datapath("io", "data", "excel", f"test1{read_ext}") - with ExcelFile(path, engine="xlrd") as excel: + with ExcelFile(path) as excel: with pytest.raises(xlrd.XLRDError): pd.read_excel(excel, sheet_name="asdf") - - -def test_excel_file_warning_with_xlsx_file(datapath): - # GH 29375 - path = datapath("io", "data", "excel", "test1.xlsx") - has_openpyxl = ( - import_optional_dependency( - "openpyxl", raise_on_missing=False, on_version="ignore" - ) - is not None - ) - if not has_openpyxl: - with tm.assert_produces_warning( - FutureWarning, - raise_on_extra_warnings=False, - match="The xlrd engine is no longer maintained", - ): - ExcelFile(path, engine=None) - else: - with tm.assert_produces_warning(None): - pd.read_excel(path, "Sheet1", engine=None) - - -def test_read_excel_warning_with_xlsx_file(tmpdir, datapath): - # GH 29375 - path = datapath("io", "data", "excel", "test1.xlsx") - has_openpyxl = ( - import_optional_dependency( - "openpyxl", raise_on_missing=False, on_version="ignore" - ) - is not None - ) - if not has_openpyxl: - with tm.assert_produces_warning( - FutureWarning, - raise_on_extra_warnings=False, - match="The xlrd engine is no longer maintained", - ): - pd.read_excel(path, "Sheet1", engine=None) - else: - with tm.assert_produces_warning(None): - pd.read_excel(path, "Sheet1", 
engine=None) diff --git a/pandas/tests/io/excel/test_xlsxwriter.py b/pandas/tests/io/excel/test_xlsxwriter.py index 6de378f6a3d3e..b6f791434a92b 100644 --- a/pandas/tests/io/excel/test_xlsxwriter.py +++ b/pandas/tests/io/excel/test_xlsxwriter.py @@ -23,15 +23,16 @@ def test_column_format(ext): with tm.ensure_clean(ext) as path: frame = DataFrame({"A": [123456, 123456], "B": [123456, 123456]}) - with ExcelWriter(path) as writer: - frame.to_excel(writer) - - # Add a number format to col B and ensure it is applied to cells. - num_format = "#,##0" - write_workbook = writer.book - write_worksheet = write_workbook.worksheets()[0] - col_format = write_workbook.add_format({"num_format": num_format}) - write_worksheet.set_column("B:B", None, col_format) + writer = ExcelWriter(path) + frame.to_excel(writer) + + # Add a number format to col B and ensure it is applied to cells. + num_format = "#,##0" + write_workbook = writer.book + write_worksheet = write_workbook.worksheets()[0] + col_format = write_workbook.add_format({"num_format": num_format}) + write_worksheet.set_column("B:B", None, col_format) + writer.save() read_workbook = openpyxl.load_workbook(path) try: diff --git a/pandas/tests/io/formats/test_to_excel.py b/pandas/tests/io/formats/test_to_excel.py index 4f1af132204bb..8529a0fb33b67 100644 --- a/pandas/tests/io/formats/test_to_excel.py +++ b/pandas/tests/io/formats/test_to_excel.py @@ -278,7 +278,7 @@ def test_css_to_excel_good_colors(input_color, output_color): f"color: {input_color}" ) - expected = {} + expected = dict() expected["fill"] = {"patternType": "solid", "fgColor": output_color} @@ -305,7 +305,7 @@ def test_css_to_excel_bad_colors(input_color): f"color: {input_color}" ) - expected = {} + expected = dict() if input_color is not None: expected["fill"] = {"patternType": "solid"} diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py index a88dec84bd693..aaadc965aca52 100644 --- a/pandas/tests/io/formats/test_to_html.py +++ b/pandas/tests/io/formats/test_to_html.py @@ -152,8 +152,8 @@ def test_to_html_decimal(datapath): @pytest.mark.parametrize( "kwargs,string,expected", [ - ({}, "", "escaped"), - ({"escape": False}, "bold", "escape_disabled"), + (dict(), "", "escaped"), + (dict(escape=False), "bold", "escape_disabled"), ], ) def test_to_html_escaped(kwargs, string, expected, datapath): diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py index ba6d7c010613b..81e8e0bd2b526 100644 --- a/pandas/tests/io/formats/test_to_latex.py +++ b/pandas/tests/io/formats/test_to_latex.py @@ -92,7 +92,7 @@ def test_to_latex_tabular_without_index(self): @pytest.mark.parametrize( "bad_column_format", - [5, 1.2, ["l", "r"], ("r", "c"), {"r", "c", "l"}, {"a": "r", "b": "l"}], + [5, 1.2, ["l", "r"], ("r", "c"), {"r", "c", "l"}, dict(a="r", b="l")], ) def test_to_latex_bad_column_format(self, bad_column_format): df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py index 5faca6bd89dad..a41af9886c617 100644 --- a/pandas/tests/io/json/test_compression.py +++ b/pandas/tests/io/json/test_compression.py @@ -65,10 +65,8 @@ def test_chunksize_with_compression(compression): df = pd.read_json('{"a": ["foo", "bar", "baz"], "b": [4, 5, 6]}') df.to_json(path, orient="records", lines=True, compression=compression) - with pd.read_json( - path, lines=True, chunksize=1, compression=compression - ) as res: - roundtripped_df = pd.concat(res) + res 
= pd.read_json(path, lines=True, chunksize=1, compression=compression) + roundtripped_df = pd.concat(res) tm.assert_frame_equal(df, roundtripped_df) diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index 244302e34337d..8d93fbcc063f4 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -521,13 +521,13 @@ def test_meta_non_iterable(self): class TestNestedToRecord: def test_flat_stays_flat(self): - recs = [{"flat1": 1, "flat2": 2}, {"flat3": 3, "flat2": 4}] + recs = [dict(flat1=1, flat2=2), dict(flat1=3, flat2=4)] result = nested_to_record(recs) expected = recs assert result == expected def test_one_level_deep_flattens(self): - data = {"flat1": 1, "dict1": {"c": 1, "d": 2}} + data = dict(flat1=1, dict1=dict(c=1, d=2)) result = nested_to_record(data) expected = {"dict1.c": 1, "dict1.d": 2, "flat1": 1} @@ -535,11 +535,7 @@ def test_one_level_deep_flattens(self): assert result == expected def test_nested_flattens(self): - data = { - "flat1": 1, - "dict1": {"c": 1, "d": 2}, - "nested": {"e": {"c": 1, "d": 2}, "d": 2}, - } + data = dict(flat1=1, dict1=dict(c=1, d=2), nested=dict(e=dict(c=1, d=2), d=2)) result = nested_to_record(data) expected = { diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index ce95eb59ed3c4..fdf2caa804def 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -379,7 +379,7 @@ def test_frame_infinity(self, orient, inf, dtype): ], ) def test_frame_to_json_float_precision(self, value, precision, expected_val): - df = DataFrame([{"a_float": value}]) + df = DataFrame([dict(a_float=value)]) encoded = df.to_json(double_precision=precision) assert encoded == f'{{"a_float":{{"0":{expected_val}}}}}' @@ -475,8 +475,8 @@ def test_blocks_compat_GH9037(self): index = DatetimeIndex(list(index), freq=None) df_mixed = DataFrame( - { - "float_1": [ + dict( + float_1=[ -0.92077639, 0.77434435, 1.25234727, @@ -488,7 +488,7 @@ def test_blocks_compat_GH9037(self): 0.95748401, -1.02970536, ], - "int_1": [ + int_1=[ 19680418, 75337055, 99973684, @@ -500,7 +500,7 @@ def test_blocks_compat_GH9037(self): 41903419, 16008365, ], - "str_1": [ + str_1=[ "78c608f1", "64a99743", "13d2ff52", @@ -512,7 +512,7 @@ def test_blocks_compat_GH9037(self): "7a669144", "8d64d068", ], - "float_2": [ + float_2=[ -0.0428278, -1.80872357, 3.36042349, @@ -524,7 +524,7 @@ def test_blocks_compat_GH9037(self): -0.03030452, 1.43366348, ], - "str_2": [ + str_2=[ "14f04af9", "d085da90", "4bcfac83", @@ -536,7 +536,7 @@ def test_blocks_compat_GH9037(self): "1f6a09ba", "4bfc4d87", ], - "int_2": [ + int_2=[ 86967717, 98098830, 51927505, @@ -548,7 +548,7 @@ def test_blocks_compat_GH9037(self): 24867120, 76131025, ], - }, + ), index=index, ) @@ -727,7 +727,9 @@ def test_series_with_dtype_datetime(self, dtype, expected): def test_frame_from_json_precise_float(self): df = DataFrame([[4.56, 4.56, 4.56], [4.56, 4.56, 4.56]]) result = read_json(df.to_json(), precise_float=True) - tm.assert_frame_equal(result, df) + tm.assert_frame_equal( + result, df, check_index_type=False, check_column_type=False + ) def test_typ(self): diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index 4bbd81ada995b..2e68d3306c7d1 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -77,8 +77,8 @@ def test_readjson_chunks(lines_json_df, chunksize): # GH17048: memory usage when lines=True unchunked = 
read_json(StringIO(lines_json_df), lines=True) - with read_json(StringIO(lines_json_df), lines=True, chunksize=chunksize) as reader: - chunked = pd.concat(reader) + reader = read_json(StringIO(lines_json_df), lines=True, chunksize=chunksize) + chunked = pd.concat(reader) tm.assert_frame_equal(chunked, unchunked) @@ -86,8 +86,7 @@ def test_readjson_chunks(lines_json_df, chunksize): def test_readjson_chunksize_requires_lines(lines_json_df): msg = "chunksize can only be passed if lines=True" with pytest.raises(ValueError, match=msg): - with pd.read_json(StringIO(lines_json_df), lines=False, chunksize=2) as _: - pass + pd.read_json(StringIO(lines_json_df), lines=False, chunksize=2) def test_readjson_chunks_series(): @@ -98,8 +97,7 @@ def test_readjson_chunks_series(): unchunked = pd.read_json(strio, lines=True, typ="Series") strio = StringIO(s.to_json(lines=True, orient="records")) - with pd.read_json(strio, lines=True, typ="Series", chunksize=1) as reader: - chunked = pd.concat(reader) + chunked = pd.concat(pd.read_json(strio, lines=True, typ="Series", chunksize=1)) tm.assert_series_equal(chunked, unchunked) @@ -107,8 +105,7 @@ def test_readjson_chunks_series(): def test_readjson_each_chunk(lines_json_df): # Other tests check that the final result of read_json(chunksize=True) # is correct. This checks the intermediate chunks. - with pd.read_json(StringIO(lines_json_df), lines=True, chunksize=2) as reader: - chunks = list(reader) + chunks = list(pd.read_json(StringIO(lines_json_df), lines=True, chunksize=2)) assert chunks[0].shape == (2, 2) assert chunks[1].shape == (1, 2) @@ -117,8 +114,7 @@ def test_readjson_chunks_from_file(): with tm.ensure_clean("test.json") as path: df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) df.to_json(path, lines=True, orient="records") - with pd.read_json(path, lines=True, chunksize=1) as reader: - chunked = pd.concat(reader) + chunked = pd.concat(pd.read_json(path, lines=True, chunksize=1)) unchunked = pd.read_json(path, lines=True) tm.assert_frame_equal(unchunked, chunked) @@ -145,8 +141,7 @@ def test_readjson_chunks_closes(chunksize): compression=None, nrows=None, ) - with reader: - reader.read() + reader.read() assert ( reader.handles.handle.closed ), f"didn't close stream with chunksize = {chunksize}" @@ -157,10 +152,7 @@ def test_readjson_invalid_chunksize(lines_json_df, chunksize): msg = r"'chunksize' must be an integer >=1" with pytest.raises(ValueError, match=msg): - with pd.read_json( - StringIO(lines_json_df), lines=True, chunksize=chunksize - ) as _: - pass + pd.read_json(StringIO(lines_json_df), lines=True, chunksize=chunksize) @pytest.mark.parametrize("chunksize", [None, 1, 2]) @@ -184,8 +176,7 @@ def test_readjson_chunks_multiple_empty_lines(chunksize): orig = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) test = pd.read_json(j, lines=True, chunksize=chunksize) if chunksize is not None: - with test: - test = pd.concat(test) + test = pd.concat(test) tm.assert_frame_equal(orig, test, obj=f"chunksize: {chunksize}") @@ -221,8 +212,8 @@ def test_readjson_nrows_chunks(nrows, chunksize): {"a": 3, "b": 4} {"a": 5, "b": 6} {"a": 7, "b": 8}""" - with read_json(jsonl, lines=True, nrows=nrows, chunksize=chunksize) as reader: - chunked = pd.concat(reader) + reader = read_json(jsonl, lines=True, nrows=nrows, chunksize=chunksize) + chunked = pd.concat(reader) expected = DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows] tm.assert_frame_equal(chunked, expected) @@ -249,6 +240,6 @@ def test_readjson_lines_chunks_fileurl(datapath): ] os_path = 
datapath("io", "json", "data", "line_delimited.json") file_url = Path(os_path).as_uri() - with pd.read_json(file_url, lines=True, chunksize=1) as url_reader: - for index, chuck in enumerate(url_reader): - tm.assert_frame_equal(chuck, df_list_expected[index]) + url_reader = pd.read_json(file_url, lines=True, chunksize=1) + for index, chuck in enumerate(url_reader): + tm.assert_frame_equal(chuck, df_list_expected[index]) diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index ced0d540f33ef..086c0b7ba08b2 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -757,10 +757,10 @@ def test_array_reshaped(self, shape): def test_array_list(self): arr_list = [ "a", - [], - {}, - {}, - [], + list(), + dict(), + dict(), + list(), 42, 97.8, ["a", "b"], @@ -797,9 +797,9 @@ def test_0d_array(self): ([42, {}, "a"], TypeError, {}), ([42, ["a"], 42], ValueError, {}), (["a", "b", [], "c"], ValueError, {}), - ([{"a": "b"}], ValueError, {"labelled": True}), - ({"a": {"b": {"c": 42}}}, ValueError, {"labelled": True}), - ([{"a": 42, "b": 23}, {"c": 17}], ValueError, {"labelled": True}), + ([{"a": "b"}], ValueError, dict(labelled=True)), + ({"a": {"b": {"c": 42}}}, ValueError, dict(labelled=True)), + ([{"a": 42, "b": 23}, {"c": 17}], ValueError, dict(labelled=True)), ], ) def test_array_numpy_except(self, bad_input, exc_type, kwargs): @@ -852,8 +852,8 @@ def test_dataframe(self, orient, numpy): columns=["x", "y", "z"], dtype=dtype, ) - encode_kwargs = {} if orient is None else {"orient": orient} - decode_kwargs = {} if numpy is None else {"numpy": numpy} + encode_kwargs = {} if orient is None else dict(orient=orient) + decode_kwargs = {} if numpy is None else dict(numpy=numpy) assert (df.dtypes == dtype).all() output = ujson.decode(ujson.encode(df, **encode_kwargs), **decode_kwargs) @@ -884,7 +884,7 @@ def test_dataframe_nested(self, orient): ) nested = {"df1": df, "df2": df.copy()} - kwargs = {} if orient is None else {"orient": orient} + kwargs = {} if orient is None else dict(orient=orient) exp = { "df1": ujson.decode(ujson.encode(df, **kwargs)), @@ -902,7 +902,7 @@ def test_dataframe_numpy_labelled(self, orient): columns=["x", "y", "z"], dtype=int, ) - kwargs = {} if orient is None else {"orient": orient} + kwargs = {} if orient is None else dict(orient=orient) output = DataFrame( *ujson.decode(ujson.encode(df, **kwargs), numpy=True, labelled=True) @@ -925,8 +925,8 @@ def test_series(self, orient, numpy): ).sort_values() assert s.dtype == dtype - encode_kwargs = {} if orient is None else {"orient": orient} - decode_kwargs = {} if numpy is None else {"numpy": numpy} + encode_kwargs = {} if orient is None else dict(orient=orient) + decode_kwargs = {} if numpy is None else dict(numpy=numpy) output = ujson.decode(ujson.encode(s, **encode_kwargs), **decode_kwargs) assert s.dtype == dtype @@ -953,7 +953,7 @@ def test_series_nested(self, orient): [10, 20, 30, 40, 50, 60], name="series", index=[6, 7, 8, 9, 10, 15] ).sort_values() nested = {"s1": s, "s2": s.copy()} - kwargs = {} if orient is None else {"orient": orient} + kwargs = {} if orient is None else dict(orient=orient) exp = { "s1": ujson.decode(ujson.encode(s, **kwargs)), diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index e8893b4c02238..d03c85f65ea8d 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -13,7 +13,7 @@ class BaseParser: def update_kwargs(self, kwargs): kwargs = kwargs.copy() - 
kwargs.update({"engine": self.engine, "low_memory": self.low_memory}) + kwargs.update(dict(engine=self.engine, low_memory=self.low_memory)) return kwargs diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index 06ccfa7f62863..eee111dd4579c 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -376,10 +376,10 @@ def test_parse_trim_buffers(c_parser_only): ) # Iterate over the CSV file in chunks of `chunksize` lines - with parser.read_csv( + chunks_ = parser.read_csv( StringIO(csv_data), header=None, dtype=object, chunksize=chunksize - ) as chunks_: - result = concat(chunks_, axis=0, ignore_index=True) + ) + result = concat(chunks_, axis=0, ignore_index=True) # Check for data corruption if there was no segfault tm.assert_frame_equal(result, expected) @@ -387,14 +387,14 @@ def test_parse_trim_buffers(c_parser_only): # This extra test was added to replicate the fault in gh-5291. # Force 'utf-8' encoding, so that `_string_convert` would take # a different execution branch. - with parser.read_csv( + chunks_ = parser.read_csv( StringIO(csv_data), header=None, dtype=object, chunksize=chunksize, encoding="utf_8", - ) as chunks_: - result = concat(chunks_, axis=0, ignore_index=True) + ) + result = concat(chunks_, axis=0, ignore_index=True) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index c8ed0d75b13a2..8f63d06859f62 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -195,11 +195,12 @@ def test_malformed_chunks(all_parsers, nrows): """ parser = all_parsers msg = "Expected 3 fields in line 6, saw 5" - with parser.read_csv( + reader = parser.read_csv( StringIO(data), header=1, comment="#", iterator=True, chunksize=1, skiprows=[2] - ) as reader: - with pytest.raises(ParserError, match=msg): - reader.read(nrows) + ) + + with pytest.raises(ParserError, match=msg): + reader.read(nrows) def test_unnamed_columns(all_parsers): @@ -470,6 +471,7 @@ def test_read_chunksize_with_index(all_parsers, index_col): bar2,12,13,14,15 """ + reader = parser.read_csv(StringIO(data), index_col=0, chunksize=2) expected = DataFrame( [ ["foo", 2, 3, 4, 5], @@ -483,8 +485,7 @@ def test_read_chunksize_with_index(all_parsers, index_col): ) expected = expected.set_index("index") - with parser.read_csv(StringIO(data), index_col=0, chunksize=2) as reader: - chunks = list(reader) + chunks = list(reader) tm.assert_frame_equal(chunks[0], expected[:2]) tm.assert_frame_equal(chunks[1], expected[2:4]) tm.assert_frame_equal(chunks[2], expected[4:]) @@ -504,8 +505,7 @@ def test_read_chunksize_bad(all_parsers, chunksize): msg = r"'chunksize' must be an integer >=1" with pytest.raises(ValueError, match=msg): - with parser.read_csv(StringIO(data), chunksize=chunksize) as _: - pass + parser.read_csv(StringIO(data), chunksize=chunksize) @pytest.mark.parametrize("chunksize", [2, 8]) @@ -522,9 +522,9 @@ def test_read_chunksize_and_nrows(all_parsers, chunksize): parser = all_parsers kwargs = dict(index_col=0, nrows=5) + reader = parser.read_csv(StringIO(data), chunksize=chunksize, **kwargs) expected = parser.read_csv(StringIO(data), **kwargs) - with parser.read_csv(StringIO(data), chunksize=chunksize, **kwargs) as reader: - tm.assert_frame_equal(concat(reader), expected) + tm.assert_frame_equal(concat(reader), expected) def test_read_chunksize_and_nrows_changing_size(all_parsers): @@ -539,13 +539,14 @@ def 
test_read_chunksize_and_nrows_changing_size(all_parsers): parser = all_parsers kwargs = dict(index_col=0, nrows=5) + reader = parser.read_csv(StringIO(data), chunksize=8, **kwargs) expected = parser.read_csv(StringIO(data), **kwargs) - with parser.read_csv(StringIO(data), chunksize=8, **kwargs) as reader: - tm.assert_frame_equal(reader.get_chunk(size=2), expected.iloc[:2]) - tm.assert_frame_equal(reader.get_chunk(size=4), expected.iloc[2:5]) - with pytest.raises(StopIteration, match=""): - reader.get_chunk(size=3) + tm.assert_frame_equal(reader.get_chunk(size=2), expected.iloc[:2]) + tm.assert_frame_equal(reader.get_chunk(size=4), expected.iloc[2:5]) + + with pytest.raises(StopIteration, match=""): + reader.get_chunk(size=3) def test_get_chunk_passed_chunksize(all_parsers): @@ -556,8 +557,8 @@ def test_get_chunk_passed_chunksize(all_parsers): 7,8,9 1,2,3""" - with parser.read_csv(StringIO(data), chunksize=2) as reader: - result = reader.get_chunk() + reader = parser.read_csv(StringIO(data), chunksize=2) + result = reader.get_chunk() expected = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"]) tm.assert_frame_equal(result, expected) @@ -575,9 +576,10 @@ def test_read_chunksize_compat(all_parsers, kwargs): bar2,12,13,14,15 """ parser = all_parsers + reader = parser.read_csv(StringIO(data), chunksize=2, **kwargs) + result = parser.read_csv(StringIO(data), **kwargs) - with parser.read_csv(StringIO(data), chunksize=2, **kwargs) as reader: - tm.assert_frame_equal(concat(reader), result) + tm.assert_frame_equal(concat(reader), result) def test_read_chunksize_jagged_names(all_parsers): @@ -586,8 +588,9 @@ def test_read_chunksize_jagged_names(all_parsers): data = "\n".join(["0"] * 7 + [",".join(["0"] * 10)]) expected = DataFrame([[0] + [np.nan] * 9] * 7 + [[0] * 10]) - with parser.read_csv(StringIO(data), names=range(10), chunksize=4) as reader: - result = concat(reader) + reader = parser.read_csv(StringIO(data), names=range(10), chunksize=4) + + result = concat(reader) tm.assert_frame_equal(result, expected) @@ -599,8 +602,8 @@ def test_read_data_list(all_parsers): data_list = [["A", "B", "C"], ["foo", "1", "2", "3"], ["bar", "4", "5", "6"]] expected = parser.read_csv(StringIO(data), **kwargs) - with TextParser(data_list, chunksize=2, **kwargs) as parser: - result = parser.read() + parser = TextParser(data_list, chunksize=2, **kwargs) + result = parser.read() tm.assert_frame_equal(result, expected) @@ -619,12 +622,12 @@ def test_iterator(all_parsers): kwargs = dict(index_col=0) expected = parser.read_csv(StringIO(data), **kwargs) - with parser.read_csv(StringIO(data), iterator=True, **kwargs) as reader: + reader = parser.read_csv(StringIO(data), iterator=True, **kwargs) - first_chunk = reader.read(3) - tm.assert_frame_equal(first_chunk, expected[:3]) + first_chunk = reader.read(3) + tm.assert_frame_equal(first_chunk, expected[:3]) - last_chunk = reader.read(5) + last_chunk = reader.read(5) tm.assert_frame_equal(last_chunk, expected[3:]) @@ -636,8 +639,8 @@ def test_iterator2(all_parsers): baz,7,8,9 """ - with parser.read_csv(StringIO(data), iterator=True) as reader: - result = list(reader) + reader = parser.read_csv(StringIO(data), iterator=True) + result = list(reader) expected = DataFrame( [[1, 2, 3], [4, 5, 6], [7, 8, 9]], @@ -660,10 +663,10 @@ def test_reader_list(all_parsers): kwargs = dict(index_col=0) lines = list(csv.reader(StringIO(data))) - with TextParser(lines, chunksize=2, **kwargs) as reader: - chunks = list(reader) + reader = TextParser(lines, chunksize=2, **kwargs) 
expected = parser.read_csv(StringIO(data), **kwargs) + chunks = list(reader) tm.assert_frame_equal(chunks[0], expected[:2]) tm.assert_frame_equal(chunks[1], expected[2:4]) @@ -683,10 +686,10 @@ def test_reader_list_skiprows(all_parsers): kwargs = dict(index_col=0) lines = list(csv.reader(StringIO(data))) - with TextParser(lines, chunksize=2, skiprows=[1], **kwargs) as reader: - chunks = list(reader) + reader = TextParser(lines, chunksize=2, skiprows=[1], **kwargs) expected = parser.read_csv(StringIO(data), **kwargs) + chunks = list(reader) tm.assert_frame_equal(chunks[0], expected[1:3]) @@ -700,8 +703,8 @@ def test_iterator_stop_on_chunksize(all_parsers): baz,7,8,9 """ - with parser.read_csv(StringIO(data), chunksize=1) as reader: - result = list(reader) + reader = parser.read_csv(StringIO(data), chunksize=1) + result = list(reader) assert len(result) == 3 expected = DataFrame( @@ -721,8 +724,7 @@ def test_iterator_skipfooter_errors(all_parsers, kwargs): data = "a\n1\n2" with pytest.raises(ValueError, match=msg): - with parser.read_csv(StringIO(data), skipfooter=1, **kwargs) as _: - pass + parser.read_csv(StringIO(data), skipfooter=1, **kwargs) def test_nrows_skipfooter_errors(all_parsers): @@ -1360,8 +1362,7 @@ def test_empty_with_nrows_chunksize(all_parsers, iterator): data = StringIO("foo,bar\n") if iterator: - with parser.read_csv(data, chunksize=nrows) as reader: - result = next(iter(reader)) + result = next(iter(parser.read_csv(data, chunksize=nrows))) else: result = parser.read_csv(data, nrows=nrows) @@ -2055,9 +2056,10 @@ def test_read_csv_memory_growth_chunksize(all_parsers): for i in range(1000): f.write(str(i) + "\n") - with parser.read_csv(path, chunksize=20) as result: - for _ in result: - pass + result = parser.read_csv(path, chunksize=20) + + for _ in result: + pass def test_read_csv_raises_on_header_prefix(all_parsers): @@ -2308,35 +2310,3 @@ def test_memory_map_compression(all_parsers, compression): parser.read_csv(path, memory_map=True, compression=compression), expected, ) - - -def test_context_manager(all_parsers, datapath): - # make sure that opened files are closed - parser = all_parsers - - path = datapath("io", "data", "csv", "iris.csv") - - reader = parser.read_csv(path, chunksize=1) - assert not reader._engine.handles.handle.closed - try: - with reader: - next(reader) - assert False - except AssertionError: - assert reader._engine.handles.handle.closed - - -def test_context_manageri_user_provided(all_parsers, datapath): - # make sure that user-provided handles are not closed - parser = all_parsers - - with open(datapath("io", "data", "csv", "iris.csv"), mode="r") as path: - - reader = parser.read_csv(path, chunksize=1) - assert not reader._engine.handles.handle.closed - try: - with reader: - next(reader) - assert False - except AssertionError: - assert not reader._engine.handles.handle.closed diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py index 220d9474c6dbf..690d3133dae5e 100644 --- a/pandas/tests/io/parser/test_compression.py +++ b/pandas/tests/io/parser/test_compression.py @@ -109,7 +109,7 @@ def test_compression(parser_and_data, compression_only, buffer, filename): def test_infer_compression(all_parsers, csv1, buffer, ext): # see gh-9770 parser = all_parsers - kwargs = {"index_col": 0, "parse_dates": True} + kwargs = dict(index_col=0, parse_dates=True) expected = parser.read_csv(csv1, **kwargs) kwargs["compression"] = "infer" @@ -144,7 +144,7 @@ def test_compression_utf_encoding(all_parsers, csv_dir_path, 
utf_value, encoding @pytest.mark.parametrize("invalid_compression", ["sfark", "bz3", "zipper"]) def test_invalid_compression(all_parsers, invalid_compression): parser = all_parsers - compress_kwargs = {"compression": invalid_compression} + compress_kwargs = dict(compression=invalid_compression) msg = f"Unrecognized compression type: {invalid_compression}" diff --git a/pandas/tests/io/parser/test_converters.py b/pandas/tests/io/parser/test_converters.py index 1d2fb7fddc9dd..88b400d9a11df 100644 --- a/pandas/tests/io/parser/test_converters.py +++ b/pandas/tests/io/parser/test_converters.py @@ -57,7 +57,7 @@ def test_converters_no_implicit_conv(all_parsers): def test_converters_euro_decimal_format(all_parsers): # see gh-583 - converters = {} + converters = dict() parser = all_parsers data = """Id;Number1;Number2;Text1;Text2;Number3 diff --git a/pandas/tests/io/parser/test_dtypes.py b/pandas/tests/io/parser/test_dtypes.py index 1e68e54b413b0..861aeba60cab7 100644 --- a/pandas/tests/io/parser/test_dtypes.py +++ b/pandas/tests/io/parser/test_dtypes.py @@ -213,11 +213,10 @@ def test_categorical_dtype_chunksize_infer_categories(all_parsers): DataFrame({"a": [1, 1], "b": Categorical(["a", "b"])}), DataFrame({"a": [1, 2], "b": Categorical(["b", "c"])}, index=[2, 3]), ] - with parser.read_csv( - StringIO(data), dtype={"b": "category"}, chunksize=2 - ) as actuals: - for actual, expected in zip(actuals, expecteds): - tm.assert_frame_equal(actual, expected) + actuals = parser.read_csv(StringIO(data), dtype={"b": "category"}, chunksize=2) + + for actual, expected in zip(actuals, expecteds): + tm.assert_frame_equal(actual, expected) def test_categorical_dtype_chunksize_explicit_categories(all_parsers): @@ -236,9 +235,10 @@ def test_categorical_dtype_chunksize_explicit_categories(all_parsers): ), ] dtype = CategoricalDtype(cats) - with parser.read_csv(StringIO(data), dtype={"b": dtype}, chunksize=2) as actuals: - for actual, expected in zip(actuals, expecteds): - tm.assert_frame_equal(actual, expected) + actuals = parser.read_csv(StringIO(data), dtype={"b": dtype}, chunksize=2) + + for actual, expected in zip(actuals, expecteds): + tm.assert_frame_equal(actual, expected) @pytest.mark.parametrize("ordered", [False, True]) @@ -495,7 +495,7 @@ def test_dtype_with_converters(all_parsers): (np.float64, DataFrame(columns=["a", "b"], dtype=np.float64)), ("category", DataFrame({"a": Categorical([]), "b": Categorical([])}, index=[])), ( - {"a": "category", "b": "category"}, + dict(a="category", b="category"), DataFrame({"a": Categorical([]), "b": Categorical([])}, index=[]), ), ("datetime64[ns]", DataFrame(columns=["a", "b"], dtype="datetime64[ns]")), @@ -510,7 +510,7 @@ def test_dtype_with_converters(all_parsers): ), ), ( - {"a": np.int64, "b": np.int32}, + dict(a=np.int64, b=np.int32), DataFrame( {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)}, index=[], diff --git a/pandas/tests/io/parser/test_mangle_dupes.py b/pandas/tests/io/parser/test_mangle_dupes.py index 457a6567febab..5c4e642115798 100644 --- a/pandas/tests/io/parser/test_mangle_dupes.py +++ b/pandas/tests/io/parser/test_mangle_dupes.py @@ -11,7 +11,7 @@ import pandas._testing as tm -@pytest.mark.parametrize("kwargs", [{}, {"mangle_dupe_cols": True}]) +@pytest.mark.parametrize("kwargs", [dict(), dict(mangle_dupe_cols=True)]) def test_basic(all_parsers, kwargs): # TODO: add test for condition "mangle_dupe_cols=False" # once it is actually supported (gh-12935) diff --git a/pandas/tests/io/parser/test_multi_thread.py 
b/pandas/tests/io/parser/test_multi_thread.py index 123dce2048a44..d50560c684084 100644 --- a/pandas/tests/io/parser/test_multi_thread.py +++ b/pandas/tests/io/parser/test_multi_thread.py @@ -2,7 +2,6 @@ Tests multithreading behaviour for reading and parsing files for each parser defined in parsers.py """ -from contextlib import ExitStack from io import BytesIO from multiprocessing.pool import ThreadPool @@ -47,18 +46,16 @@ def test_multi_thread_string_io_read_csv(all_parsers): "\n".join([f"{i:d},{i:d},{i:d}" for i in range(max_row_range)]).encode() for _ in range(num_files) ] + files = [BytesIO(b) for b in bytes_to_df] # Read all files in many threads. - with ExitStack() as stack: - files = [stack.enter_context(BytesIO(b)) for b in bytes_to_df] + pool = ThreadPool(8) - pool = stack.enter_context(ThreadPool(8)) + results = pool.map(parser.read_csv, files) + first_result = results[0] - results = pool.map(parser.read_csv, files) - first_result = results[0] - - for result in results: - tm.assert_frame_equal(first_result, result) + for result in results: + tm.assert_frame_equal(first_result, result) def _generate_multi_thread_dataframe(parser, path, num_rows, num_tasks): @@ -119,8 +116,8 @@ def reader(arg): (num_rows * i // num_tasks, num_rows // num_tasks) for i in range(num_tasks) ] - with ThreadPool(processes=num_tasks) as pool: - results = pool.map(reader, tasks) + pool = ThreadPool(processes=num_tasks) + results = pool.map(reader, tasks) header = results[0].columns diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index 97f82b9a01a9a..b8b03cbd14a1d 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -122,45 +122,41 @@ def test_parse_public_s3_bucket_chunked(self, tips_df, s3so): # Read with a chunksize chunksize = 5 for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: - with read_csv( + df_reader = read_csv( "s3://pandas-test/tips.csv" + ext, chunksize=chunksize, compression=comp, storage_options=s3so, - ) as df_reader: - assert df_reader.chunksize == chunksize - for i_chunk in [0, 1, 2]: - # Read a couple of chunks and make sure we see them - # properly. - df = df_reader.get_chunk() - assert isinstance(df, DataFrame) - assert not df.empty - true_df = tips_df.iloc[ - chunksize * i_chunk : chunksize * (i_chunk + 1) - ] - tm.assert_frame_equal(true_df, df) + ) + assert df_reader.chunksize == chunksize + for i_chunk in [0, 1, 2]: + # Read a couple of chunks and make sure we see them + # properly. + df = df_reader.get_chunk() + assert isinstance(df, DataFrame) + assert not df.empty + true_df = tips_df.iloc[chunksize * i_chunk : chunksize * (i_chunk + 1)] + tm.assert_frame_equal(true_df, df) def test_parse_public_s3_bucket_chunked_python(self, tips_df, s3so): # Read with a chunksize using the Python parser chunksize = 5 for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: - with read_csv( + df_reader = read_csv( "s3://pandas-test/tips.csv" + ext, chunksize=chunksize, compression=comp, engine="python", storage_options=s3so, - ) as df_reader: - assert df_reader.chunksize == chunksize - for i_chunk in [0, 1, 2]: - # Read a couple of chunks and make sure we see them properly. 
- df = df_reader.get_chunk() - assert isinstance(df, DataFrame) - assert not df.empty - true_df = tips_df.iloc[ - chunksize * i_chunk : chunksize * (i_chunk + 1) - ] - tm.assert_frame_equal(true_df, df) + ) + assert df_reader.chunksize == chunksize + for i_chunk in [0, 1, 2]: + # Read a couple of chunks and make sure we see them properly. + df = df_reader.get_chunk() + assert isinstance(df, DataFrame) + assert not df.empty + true_df = tips_df.iloc[chunksize * i_chunk : chunksize * (i_chunk + 1)] + tm.assert_frame_equal(true_df, df) def test_parse_public_s3_bucket_python(self, tips_df, s3so): for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index a20ca508ebbfe..7a5203ca86520 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -1020,13 +1020,13 @@ def test_multiple_date_cols_chunked(all_parsers): ) expected = expected.set_index("nominal") - with parser.read_csv( + reader = parser.read_csv( StringIO(data), parse_dates={"nominal": [1, 2]}, index_col="nominal", chunksize=2, - ) as reader: - chunks = list(reader) + ) + chunks = list(reader) tm.assert_frame_equal(chunks[0], expected[:2]) tm.assert_frame_equal(chunks[1], expected[2:4]) diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py index 1af69785c7584..413b78a52ad38 100644 --- a/pandas/tests/io/parser/test_textreader.py +++ b/pandas/tests/io/parser/test_textreader.py @@ -336,10 +336,8 @@ def test_empty_field_eof(self): def test_empty_csv_input(self): # GH14867 - with read_csv( - StringIO(), chunksize=20, header=None, names=["a", "b", "c"] - ) as df: - assert isinstance(df, TextFileReader) + df = read_csv(StringIO(), chunksize=20, header=None, names=["a", "b", "c"]) + assert isinstance(df, TextFileReader) def assert_array_dicts_equal(left, right): diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index f8d283f622d4d..afd2f56efb935 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -112,7 +112,7 @@ def roundtrip(key, obj, **kwargs): tm.assert_frame_equal(o, roundtrip("frame", o)) # table - df = DataFrame({"A": range(5), "B": range(5)}) + df = DataFrame(dict(A=range(5), B=range(5))) df.to_hdf(path, "table", append=True) result = read_hdf(path, "table", where=["index>2"]) tm.assert_frame_equal(df[df.index > 2], result) @@ -370,7 +370,7 @@ def test_keys_ignore_hdf_softlink(self, setup_path): with ensure_clean_store(setup_path) as store: - df = DataFrame({"A": range(5), "B": range(5)}) + df = DataFrame(dict(A=range(5), B=range(5))) store.put("df", df) assert store.keys() == ["/df"] @@ -1081,7 +1081,7 @@ def check(format, index): def test_encoding(self, setup_path): with ensure_clean_store(setup_path) as store: - df = DataFrame({"A": "foo", "B": "bar"}, index=range(5)) + df = DataFrame(dict(A="foo", B="bar"), index=range(5)) df.loc[2, "A"] = np.nan df.loc[3, "B"] = np.nan _maybe_remove(store, "df") @@ -1458,7 +1458,7 @@ def check_col(key, name, size): store.get_storer(key).table.description, name ).itemsize, size - df = DataFrame({"A": "foo", "B": "bar"}, index=range(10)) + df = DataFrame(dict(A="foo", B="bar"), index=range(10)) # a min_itemsize that creates a data_column _maybe_remove(store, "df") @@ -1631,13 +1631,16 @@ def check_col(key, name, size): & (df_new.A > 0) & (df_new.B < 0) ] - tm.assert_frame_equal(result, expected, 
check_freq=False) - # FIXME: 2020-05-07 freq check randomly fails in the CI + tm.assert_frame_equal( + result, expected, check_index_type=False, check_freq=False + ) # yield an empty frame result = store.select("df", "string='foo' and string2='cool'") expected = df_new[(df_new.string == "foo") & (df_new.string2 == "cool")] - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal( + result, expected, check_index_type=False, check_freq=False + ) with ensure_clean_store(setup_path) as store: # doc example @@ -1657,11 +1660,16 @@ def check_col(key, name, size): result = store.select("df_dc", "B>0") expected = df_dc[df_dc.B > 0] - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal( + result, expected, check_index_type=False, check_freq=False + ) result = store.select("df_dc", ["B > 0", "C > 0", "string == foo"]) expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == "foo")] - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal( + result, expected, check_index_type=False, check_freq=False + ) + # FIXME: 2020-05-07 freq check randomly fails in the CI with ensure_clean_store(setup_path) as store: # doc example part 2 @@ -2188,13 +2196,13 @@ def test_append_with_timedelta(self, setup_path): # append timedelta df = DataFrame( - { - "A": Timestamp("20130101"), - "B": [ + dict( + A=Timestamp("20130101"), + B=[ Timestamp("20130101") + timedelta(days=i, seconds=10) for i in range(10) ], - } + ) ) df["C"] = df["A"] - df["B"] df.loc[3:5, "C"] = np.nan @@ -2366,7 +2374,9 @@ def test_series(self, setup_path): ts3 = Series( ts.values, Index(np.asarray(ts.index, dtype=object), dtype=object) ) - self._check_roundtrip(ts3, tm.assert_series_equal, path=setup_path) + self._check_roundtrip( + ts3, tm.assert_series_equal, path=setup_path, check_index_type=False + ) def test_float_index(self, setup_path): @@ -2732,10 +2742,7 @@ def test_select_dtypes(self, setup_path): with ensure_clean_store(setup_path) as store: # with a Timestamp data column (GH #2637) df = DataFrame( - { - "ts": bdate_range("2012-01-01", periods=300), - "A": np.random.randn(300), - } + dict(ts=bdate_range("2012-01-01", periods=300), A=np.random.randn(300)) ) _maybe_remove(store, "df") store.append("df", df, data_columns=["ts", "A"]) @@ -2763,7 +2770,7 @@ def test_select_dtypes(self, setup_path): tm.assert_frame_equal(expected, result) # integer index - df = DataFrame({"A": np.random.rand(20), "B": np.random.rand(20)}) + df = DataFrame(dict(A=np.random.rand(20), B=np.random.rand(20))) _maybe_remove(store, "df_int") store.append("df_int", df) result = store.select("df_int", "index<10 and columns=['A']") @@ -2772,11 +2779,11 @@ def test_select_dtypes(self, setup_path): # float index df = DataFrame( - { - "A": np.random.rand(20), - "B": np.random.rand(20), - "index": np.arange(20, dtype="f8"), - } + dict( + A=np.random.rand(20), + B=np.random.rand(20), + index=np.arange(20, dtype="f8"), + ) ) _maybe_remove(store, "df_float") store.append("df_float", df) @@ -2787,7 +2794,7 @@ def test_select_dtypes(self, setup_path): with ensure_clean_store(setup_path) as store: # floats w/o NaN - df = DataFrame({"cols": range(11), "values": range(11)}, dtype="float64") + df = DataFrame(dict(cols=range(11), values=range(11)), dtype="float64") df["cols"] = (df["cols"] + 10).apply(str) store.append("df1", df, data_columns=True) @@ -2811,7 +2818,7 @@ def test_select_dtypes(self, setup_path): # tm.assert_frame_equal(expected, result) # not in first position float with NaN ok too - df = DataFrame({"cols": range(11), "values": 
range(11)}, dtype="float64") + df = DataFrame(dict(cols=range(11), values=range(11)), dtype="float64") df["cols"] = (df["cols"] + 10).apply(str) df.iloc[1] = np.nan @@ -2838,15 +2845,15 @@ def test_select_with_many_inputs(self, setup_path): with ensure_clean_store(setup_path) as store: df = DataFrame( - { - "ts": bdate_range("2012-01-01", periods=300), - "A": np.random.randn(300), - "B": range(300), - "users": ["a"] * 50 + dict( + ts=bdate_range("2012-01-01", periods=300), + A=np.random.randn(300), + B=range(300), + users=["a"] * 50 + ["b"] * 50 + ["c"] * 100 + [f"a{i:03d}" for i in range(100)], - } + ) ) _maybe_remove(store, "df") store.append("df", df, data_columns=["ts", "A", "B", "users"]) @@ -3142,7 +3149,7 @@ def test_retain_index_attributes(self, setup_path): # GH 3499, losing frequency info on index recreation df = DataFrame( - {"A": Series(range(3), index=date_range("2000-1-1", periods=3, freq="H"))} + dict(A=Series(range(3), index=date_range("2000-1-1", periods=3, freq="H"))) ) with ensure_clean_store(setup_path) as store: @@ -3161,11 +3168,11 @@ def test_retain_index_attributes(self, setup_path): # try to append a table with a different frequency with catch_warnings(record=True): df2 = DataFrame( - { - "A": Series( + dict( + A=Series( range(3), index=date_range("2002-1-1", periods=3, freq="D") ) - } + ) ) store.append("data", df2) @@ -3174,8 +3181,8 @@ def test_retain_index_attributes(self, setup_path): # this is ok _maybe_remove(store, "df2") df2 = DataFrame( - { - "A": Series( + dict( + A=Series( range(3), index=[ Timestamp("20010101"), @@ -3183,15 +3190,15 @@ def test_retain_index_attributes(self, setup_path): Timestamp("20020101"), ], ) - } + ) ) store.append("df2", df2) df3 = DataFrame( - { - "A": Series( + dict( + A=Series( range(3), index=date_range("2002-1-1", periods=3, freq="D") ) - } + ) ) store.append("df2", df3) @@ -3204,26 +3211,25 @@ def test_retain_index_attributes2(self, setup_path): with catch_warnings(record=True): df = DataFrame( - { - "A": Series( + dict( + A=Series( range(3), index=date_range("2000-1-1", periods=3, freq="H") ) - } + ) ) df.to_hdf(path, "data", mode="w", append=True) df2 = DataFrame( - { - "A": Series( + dict( + A=Series( range(3), index=date_range("2002-1-1", periods=3, freq="D") ) - } + ) ) - df2.to_hdf(path, "data", append=True) idx = date_range("2000-1-1", periods=3, freq="H") idx.name = "foo" - df = DataFrame({"A": Series(range(3), index=idx)}) + df = DataFrame(dict(A=Series(range(3), index=idx))) df.to_hdf(path, "data", mode="w", append=True) assert read_hdf(path, "data").index.name == "foo" @@ -3232,7 +3238,7 @@ def test_retain_index_attributes2(self, setup_path): idx2 = date_range("2001-1-1", periods=3, freq="H") idx2.name = "bar" - df2 = DataFrame({"A": Series(range(3), index=idx2)}) + df2 = DataFrame(dict(A=Series(range(3), index=idx2))) df2.to_hdf(path, "data", append=True) assert read_hdf(path, "data").index.name is None @@ -3533,7 +3539,7 @@ def test_coordinates(self, setup_path): # get coordinates back & test vs frame _maybe_remove(store, "df") - df = DataFrame({"A": range(5), "B": range(5)}) + df = DataFrame(dict(A=range(5), B=range(5))) store.append("df", df) c = store.select_as_coordinates("df", ["index<3"]) assert (c.values == np.arange(3)).all() @@ -3795,12 +3801,12 @@ def test_nan_selection_bug_4858(self, setup_path): with ensure_clean_store(setup_path) as store: - df = DataFrame({"cols": range(6), "values": range(6)}, dtype="float64") + df = DataFrame(dict(cols=range(6), values=range(6)), dtype="float64") df["cols"] = 
(df["cols"] + 10).apply(str) df.iloc[0] = np.nan expected = DataFrame( - {"cols": ["13.0", "14.0", "15.0"], "values": [3.0, 4.0, 5.0]}, + dict(cols=["13.0", "14.0", "15.0"], values=[3.0, 4.0, 5.0]), index=[3, 4, 5], ) @@ -3814,7 +3820,7 @@ def test_start_stop_table(self, setup_path): with ensure_clean_store(setup_path) as store: # table - df = DataFrame({"A": np.random.rand(20), "B": np.random.rand(20)}) + df = DataFrame(dict(A=np.random.rand(20), B=np.random.rand(20))) store.append("df", df) result = store.select("df", "columns=['A']", start=0, stop=5) @@ -3849,7 +3855,7 @@ def test_start_stop_fixed(self, setup_path): # fixed, GH 8287 df = DataFrame( - {"A": np.random.rand(20), "B": np.random.rand(20)}, + dict(A=np.random.rand(20), B=np.random.rand(20)), index=pd.date_range("20130101", periods=20), ) store.put("df", df) @@ -4482,7 +4488,7 @@ def test_categorical_conversion(self, setup_path): data = [4.3, 9.8] # Test without categories - df = DataFrame({"obsids": obsids, "imgids": imgids, "data": data}) + df = DataFrame(dict(obsids=obsids, imgids=imgids, data=data)) # We are expecting an empty DataFrame matching types of df expected = df.iloc[[], :] diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index cca62c5af59a1..9de6ca75fd4d9 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -52,22 +52,24 @@ def test_from_buffer(self): with open(fname, "rb") as f: byts = f.read() buf = io.BytesIO(byts) - with pd.read_sas( + rdr = pd.read_sas( buf, format="sas7bdat", iterator=True, encoding="utf-8" - ) as rdr: - df = rdr.read() + ) + df = rdr.read() tm.assert_frame_equal(df, df0, check_exact=False) + rdr.close() def test_from_iterator(self): for j in 0, 1: df0 = self.data[j] for k in self.test_ix[j]: fname = os.path.join(self.dirpath, f"test{k}.sas7bdat") - with pd.read_sas(fname, iterator=True, encoding="utf-8") as rdr: - df = rdr.read(2) - tm.assert_frame_equal(df, df0.iloc[0:2, :]) - df = rdr.read(3) - tm.assert_frame_equal(df, df0.iloc[2:5, :]) + rdr = pd.read_sas(fname, iterator=True, encoding="utf-8") + df = rdr.read(2) + tm.assert_frame_equal(df, df0.iloc[0:2, :]) + df = rdr.read(3) + tm.assert_frame_equal(df, df0.iloc[2:5, :]) + rdr.close() def test_path_pathlib(self): for j in 0, 1: @@ -94,24 +96,25 @@ def test_iterator_loop(self): for k in self.test_ix[j]: for chunksize in 3, 5, 10, 11: fname = os.path.join(self.dirpath, f"test{k}.sas7bdat") - with pd.read_sas(fname, chunksize=10, encoding="utf-8") as rdr: - y = 0 - for x in rdr: - y += x.shape[0] + rdr = pd.read_sas(fname, chunksize=10, encoding="utf-8") + y = 0 + for x in rdr: + y += x.shape[0] assert y == rdr.row_count + rdr.close() def test_iterator_read_too_much(self): # github #14734 k = self.test_ix[0][0] fname = os.path.join(self.dirpath, f"test{k}.sas7bdat") - with pd.read_sas( - fname, format="sas7bdat", iterator=True, encoding="utf-8" - ) as rdr: - d1 = rdr.read(rdr.row_count + 20) + rdr = pd.read_sas(fname, format="sas7bdat", iterator=True, encoding="utf-8") + d1 = rdr.read(rdr.row_count + 20) + rdr.close() - with pd.read_sas(fname, iterator=True, encoding="utf-8") as rdr: - d2 = rdr.read(rdr.row_count + 20) + rdr = pd.read_sas(fname, iterator=True, encoding="utf-8") + d2 = rdr.read(rdr.row_count + 20) tm.assert_frame_equal(d1, d2) + rdr.close() def test_encoding_options(datapath): diff --git a/pandas/tests/io/sas/test_xport.py b/pandas/tests/io/sas/test_xport.py index a8713f5bf36c9..939edb3d8e0b4 100644 --- a/pandas/tests/io/sas/test_xport.py 
+++ b/pandas/tests/io/sas/test_xport.py @@ -47,25 +47,29 @@ def test1_basic(self): num_rows = data.shape[0] # Test reading beyond end of file - with read_sas(self.file01, format="xport", iterator=True) as reader: - data = reader.read(num_rows + 100) + reader = read_sas(self.file01, format="xport", iterator=True) + data = reader.read(num_rows + 100) assert data.shape[0] == num_rows + reader.close() # Test incremental read with `read` method. - with read_sas(self.file01, format="xport", iterator=True) as reader: - data = reader.read(10) + reader = read_sas(self.file01, format="xport", iterator=True) + data = reader.read(10) + reader.close() tm.assert_frame_equal(data, data_csv.iloc[0:10, :]) # Test incremental read with `get_chunk` method. - with read_sas(self.file01, format="xport", chunksize=10) as reader: - data = reader.get_chunk() + reader = read_sas(self.file01, format="xport", chunksize=10) + data = reader.get_chunk() + reader.close() tm.assert_frame_equal(data, data_csv.iloc[0:10, :]) # Test read in loop m = 0 - with read_sas(self.file01, format="xport", chunksize=100) as reader: - for x in reader: - m += x.shape[0] + reader = read_sas(self.file01, format="xport", chunksize=100) + for x in reader: + m += x.shape[0] + reader.close() assert m == num_rows # Read full file with `read_sas` method @@ -85,17 +89,15 @@ def test1_index(self): tm.assert_frame_equal(data, data_csv, check_index_type=False) # Test incremental read with `read` method. - with read_sas( - self.file01, index="SEQN", format="xport", iterator=True - ) as reader: - data = reader.read(10) + reader = read_sas(self.file01, index="SEQN", format="xport", iterator=True) + data = reader.read(10) + reader.close() tm.assert_frame_equal(data, data_csv.iloc[0:10, :], check_index_type=False) # Test incremental read with `get_chunk` method. 
- with read_sas( - self.file01, index="SEQN", format="xport", chunksize=10 - ) as reader: - data = reader.get_chunk() + reader = read_sas(self.file01, index="SEQN", format="xport", chunksize=10) + data = reader.get_chunk() + reader.close() tm.assert_frame_equal(data, data_csv.iloc[0:10, :], check_index_type=False) def test1_incremental(self): @@ -105,8 +107,9 @@ def test1_incremental(self): data_csv = data_csv.set_index("SEQN") numeric_as_float(data_csv) - with read_sas(self.file01, index="SEQN", chunksize=1000) as reader: - all_data = list(reader) + reader = read_sas(self.file01, index="SEQN", chunksize=1000) + + all_data = list(reader) data = pd.concat(all_data, axis=0) tm.assert_frame_equal(data, data_csv, check_index_type=False) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index c3b21daa0ac04..c7a7101b5fe17 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -121,16 +121,16 @@ def test_get_handle_with_buffer(self): input_buffer.close() def test_iterator(self): - with pd.read_csv(StringIO(self.data1), chunksize=1) as reader: - result = pd.concat(reader, ignore_index=True) + reader = pd.read_csv(StringIO(self.data1), chunksize=1) + result = pd.concat(reader, ignore_index=True) expected = pd.read_csv(StringIO(self.data1)) tm.assert_frame_equal(result, expected) # GH12153 - with pd.read_csv(StringIO(self.data1), chunksize=1) as it: - first = next(it) - tm.assert_frame_equal(first, expected.iloc[[0]]) - tm.assert_frame_equal(pd.concat(it), expected.iloc[1:]) + it = pd.read_csv(StringIO(self.data1), chunksize=1) + first = next(it) + tm.assert_frame_equal(first, expected.iloc[[0]]) + tm.assert_frame_equal(pd.concat(it), expected.iloc[1:]) @pytest.mark.parametrize( "reader, module, error_class, fn_ext", diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index ba8b1a8a0679d..9a883aac69e6b 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -14,15 +14,7 @@ from pandas.errors import ParserError import pandas.util._test_decorators as td -from pandas import ( - DataFrame, - MultiIndex, - Series, - Timestamp, - date_range, - read_csv, - to_datetime, -) +from pandas import DataFrame, MultiIndex, Series, Timestamp, date_range, read_csv import pandas._testing as tm from pandas.io.common import file_path_to_url @@ -618,7 +610,7 @@ def try_remove_ws(x): gtnew = ground_truth.applymap(try_remove_ws) converted = dfnew._convert(datetime=True, numeric=True) date_cols = ["Closing Date", "Updated Date"] - converted[date_cols] = converted[date_cols].apply(to_datetime) + converted[date_cols] = converted[date_cols]._convert(datetime=True, coerce=True) tm.assert_frame_equal(converted, gtnew) @pytest.mark.slow diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index fe3ca0d0937b3..3b83eed69c723 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -645,7 +645,7 @@ def test_s3_roundtrip_explicit_fs(self, df_compat, s3_resource, pa, s3so): if LooseVersion(pyarrow.__version__) <= LooseVersion("0.17.0"): pytest.skip() s3 = s3fs.S3FileSystem(**s3so) - kw = {"filesystem": s3} + kw = dict(filesystem=s3) check_round_trip( df_compat, pa, @@ -658,7 +658,7 @@ def test_s3_roundtrip(self, df_compat, s3_resource, pa, s3so): if LooseVersion(pyarrow.__version__) <= LooseVersion("0.17.0"): pytest.skip() # GH #19134 - s3so = {"storage_options": s3so} + s3so = dict(storage_options=s3so) check_round_trip( df_compat, pa, @@ -710,12 +710,10 @@ def 
test_s3_roundtrip_for_dir( pa, expected=expected_df, path="s3://pandas-test/parquet_dir", - read_kwargs={"storage_options": s3so}, - write_kwargs={ - "partition_cols": partition_col, - "compression": None, - "storage_options": s3so, - }, + read_kwargs=dict(storage_options=s3so), + write_kwargs=dict( + partition_cols=partition_col, compression=None, storage_options=s3so + ), check_like=True, repeat=1, ) @@ -830,35 +828,6 @@ def test_additional_extension_types(self, pa): ) check_round_trip(df, pa) - @td.skip_if_no("pyarrow", min_version="0.16") - def test_use_nullable_dtypes(self, pa): - import pyarrow.parquet as pq - - table = pyarrow.table( - { - "a": pyarrow.array([1, 2, 3, None], "int64"), - "b": pyarrow.array([1, 2, 3, None], "uint8"), - "c": pyarrow.array(["a", "b", "c", None]), - "d": pyarrow.array([True, False, True, None]), - } - ) - with tm.ensure_clean() as path: - # write manually with pyarrow to write integers - pq.write_table(table, path) - result1 = read_parquet(path) - result2 = read_parquet(path, use_nullable_dtypes=True) - - assert result1["a"].dtype == np.dtype("float64") - expected = pd.DataFrame( - { - "a": pd.array([1, 2, 3, None], dtype="Int64"), - "b": pd.array([1, 2, 3, None], dtype="UInt8"), - "c": pd.array(["a", "b", "c", None], dtype="string"), - "d": pd.array([True, False, True, None], dtype="boolean"), - } - ) - tm.assert_frame_equal(result2, expected) - @td.skip_if_no("pyarrow", min_version="0.14") def test_timestamp_nanoseconds(self, pa): # with version 2.0, pyarrow defaults to writing the nanoseconds, so @@ -948,8 +917,8 @@ def test_s3_roundtrip(self, df_compat, s3_resource, fp, s3so): df_compat, fp, path="s3://pandas-test/fastparquet.parquet", - read_kwargs={"storage_options": s3so}, - write_kwargs={"compression": None, "storage_options": s3so}, + read_kwargs=dict(storage_options=s3so), + write_kwargs=dict(compression=None, storage_options=s3so), ) def test_partition_cols_supported(self, fp, df_full): @@ -1032,11 +1001,3 @@ def test_timezone_aware_index(self, fp, timezone_aware_date_list): expected = df.copy() expected.index.name = "index" check_round_trip(df, fp, expected=expected) - - def test_use_nullable_dtypes_not_supported(self, fp): - df = pd.DataFrame({"a": [1, 2]}) - - with tm.ensure_clean() as path: - df.to_parquet(path) - with pytest.raises(ValueError, match="not supported for the fastparquet"): - read_parquet(path, engine="fastparquet", use_nullable_dtypes=True) diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 24944281419c3..b065aa187f5fb 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -1974,12 +1974,12 @@ def test_iterator_value_labels(): df = DataFrame({f"col{k}": pd.Categorical(values, ordered=True) for k in range(2)}) with tm.ensure_clean() as path: df.to_stata(path, write_index=False) + reader = pd.read_stata(path, chunksize=100) expected = pd.Index(["a_label", "b_label", "c_label"], dtype="object") - with pd.read_stata(path, chunksize=100) as reader: - for j, chunk in enumerate(reader): - for i in range(2): - tm.assert_index_equal(chunk.dtypes[i].categories, expected) - tm.assert_frame_equal(chunk, df.iloc[j * 100 : (j + 1) * 100]) + for j, chunk in enumerate(reader): + for i in range(2): + tm.assert_index_equal(chunk.dtypes[i].categories, expected) + tm.assert_frame_equal(chunk, df.iloc[j * 100 : (j + 1) * 100]) def test_precision_loss(): diff --git a/pandas/tests/plotting/common.py b/pandas/tests/plotting/common.py index 1f94e18d8e622..c868c8d4fba07 100644 --- 
a/pandas/tests/plotting/common.py +++ b/pandas/tests/plotting/common.py @@ -1,10 +1,3 @@ -""" -Module consolidating common testing functions for checking plotting. - -Currently all plotting tests are marked as slow via -``pytestmark = pytest.mark.slow`` at the module level. -""" - import os from typing import TYPE_CHECKING, Sequence, Union import warnings diff --git a/pandas/tests/plotting/frame/test_frame.py b/pandas/tests/plotting/frame/test_frame.py index dc7478fe6ef4a..77a4c4a8faf5e 100644 --- a/pandas/tests/plotting/frame/test_frame.py +++ b/pandas/tests/plotting/frame/test_frame.py @@ -21,8 +21,6 @@ from pandas.io.formats.printing import pprint_thing import pandas.plotting as plotting -pytestmark = pytest.mark.slow - @td.skip_if_no_mpl class TestDataFramePlots(TestPlotBase): @@ -41,6 +39,7 @@ def setup_method(self, method): } ) + @pytest.mark.slow def test_plot(self): from pandas.plotting._matplotlib.compat import mpl_ge_3_1_0 @@ -172,11 +171,13 @@ def test_nonnumeric_exclude(self): ax = df.plot() assert len(ax.get_lines()) == 1 # B was plotted + @pytest.mark.slow def test_implicit_label(self): df = DataFrame(np.random.randn(10, 3), columns=["a", "b", "c"]) ax = df.plot(x="a", y="b") self._check_text_labels(ax.xaxis.get_label(), "a") + @pytest.mark.slow def test_donot_overwrite_index_name(self): # GH 8494 df = DataFrame(np.random.randn(2, 2), columns=["a", "b"]) @@ -184,6 +185,7 @@ def test_donot_overwrite_index_name(self): df.plot(y="b", label="LABEL") assert df.index.name == "NAME" + @pytest.mark.slow def test_plot_xy(self): # columns.inferred_type == 'string' df = self.tdf @@ -208,6 +210,7 @@ def test_plot_xy(self): # columns.inferred_type == 'mixed' # TODO add MultiIndex test + @pytest.mark.slow @pytest.mark.parametrize( "input_log, expected_log", [(True, "log"), ("sym", "symlog")] ) @@ -236,6 +239,7 @@ def test_invalid_logscale(self, input_param): with pytest.raises(ValueError, match=msg): df.plot(**{input_param: "sm"}) + @pytest.mark.slow def test_xcompat(self): import pandas as pd @@ -456,28 +460,22 @@ def test_line_lim(self): assert xmin <= lines[0].get_data()[0][0] assert xmax >= lines[0].get_data()[0][-1] - @pytest.mark.xfail( - strict=False, - reason="2020-12-01 this has been failing periodically on the " - "ymin==0 assertion for a week or so.", - ) - @pytest.mark.parametrize("stacked", [True, False]) - def test_area_lim(self, stacked): + def test_area_lim(self): df = DataFrame(np.random.rand(6, 4), columns=["x", "y", "z", "four"]) neg_df = -df + for stacked in [True, False]: + ax = _check_plot_works(df.plot.area, stacked=stacked) + xmin, xmax = ax.get_xlim() + ymin, ymax = ax.get_ylim() + lines = ax.get_lines() + assert xmin <= lines[0].get_data()[0][0] + assert xmax >= lines[0].get_data()[0][-1] + assert ymin == 0 - ax = _check_plot_works(df.plot.area, stacked=stacked) - xmin, xmax = ax.get_xlim() - ymin, ymax = ax.get_ylim() - lines = ax.get_lines() - assert xmin <= lines[0].get_data()[0][0] - assert xmax >= lines[0].get_data()[0][-1] - assert ymin == 0 - - ax = _check_plot_works(neg_df.plot.area, stacked=stacked) - ymin, ymax = ax.get_ylim() - assert ymax == 0 + ax = _check_plot_works(neg_df.plot.area, stacked=stacked) + ymin, ymax = ax.get_ylim() + assert ymax == 0 def test_area_sharey_dont_overwrite(self): # GH37942 @@ -490,6 +488,7 @@ def test_area_sharey_dont_overwrite(self): assert ax1._shared_y_axes.joined(ax1, ax2) assert ax2._shared_y_axes.joined(ax1, ax2) + @pytest.mark.slow def test_bar_linewidth(self): df = DataFrame(np.random.randn(5, 5)) @@ -510,6 +509,7 
@@ def test_bar_linewidth(self): for r in ax.patches: assert r.get_linewidth() == 2 + @pytest.mark.slow def test_bar_barwidth(self): df = DataFrame(np.random.randn(5, 5)) @@ -547,6 +547,7 @@ def test_bar_barwidth(self): for r in ax.patches: assert r.get_height() == width + @pytest.mark.slow def test_bar_bottom_left(self): df = DataFrame(np.random.rand(5, 5)) ax = df.plot.bar(stacked=False, bottom=1) @@ -575,6 +576,7 @@ def test_bar_bottom_left(self): result = [p.get_x() for p in ax.patches] assert result == [1] * 5 + @pytest.mark.slow def test_bar_nan(self): df = DataFrame({"A": [10, np.nan, 20], "B": [5, 10, 20], "C": [1, 2, 3]}) ax = df.plot.bar() @@ -590,6 +592,7 @@ def test_bar_nan(self): expected = [0.0, 0.0, 0.0, 10.0, 0.0, 20.0, 15.0, 10.0, 40.0] assert result == expected + @pytest.mark.slow def test_bar_categorical(self): # GH 13019 df1 = DataFrame( @@ -619,6 +622,7 @@ def test_bar_categorical(self): assert ax.patches[0].get_x() == -0.25 assert ax.patches[-1].get_x() == 4.75 + @pytest.mark.slow def test_plot_scatter(self): df = DataFrame( np.random.randn(6, 4), @@ -658,23 +662,25 @@ def test_scatterplot_datetime_data(self): def test_scatterplot_object_data(self): # GH 18755 - df = DataFrame({"a": ["A", "B", "C"], "b": [2, 3, 4]}) + df = DataFrame(dict(a=["A", "B", "C"], b=[2, 3, 4])) _check_plot_works(df.plot.scatter, x="a", y="b") _check_plot_works(df.plot.scatter, x=0, y=1) - df = DataFrame({"a": ["A", "B", "C"], "b": ["a", "b", "c"]}) + df = DataFrame(dict(a=["A", "B", "C"], b=["a", "b", "c"])) _check_plot_works(df.plot.scatter, x="a", y="b") _check_plot_works(df.plot.scatter, x=0, y=1) @pytest.mark.parametrize("x, y", [("x", "y"), ("y", "x"), ("y", "y")]) + @pytest.mark.slow def test_plot_scatter_with_categorical_data(self, x, y): # after fixing GH 18755, should be able to plot categorical data df = DataFrame({"x": [1, 2, 3, 4], "y": pd.Categorical(["a", "b", "a", "c"])}) _check_plot_works(df.plot.scatter, x=x, y=y) + @pytest.mark.slow def test_plot_scatter_with_c(self): df = DataFrame( np.random.randn(6, 4), @@ -733,6 +739,7 @@ def test_plot_scatter_with_s(self): ax = df.plot.scatter(x="a", y="b", s="c") tm.assert_numpy_array_equal(df["c"].values, right=ax.collections[0].get_sizes()) + @pytest.mark.slow def test_plot_bar(self): df = DataFrame( np.random.randn(6, 4), @@ -765,6 +772,7 @@ def test_plot_bar(self): ax = df.plot.barh(rot=55, fontsize=11) self._check_ticks_props(ax, yrot=55, ylabelsize=11, xlabelsize=11) + @pytest.mark.slow def test_boxplot(self): df = self.hist_df series = df["height"] @@ -793,6 +801,7 @@ def test_boxplot(self): tm.assert_numpy_array_equal(ax.xaxis.get_ticklocs(), positions) assert len(ax.lines) == self.bp_n_objects * len(numeric_cols) + @pytest.mark.slow def test_boxplot_vertical(self): df = self.hist_df numeric_cols = df._get_numeric_data().columns @@ -823,6 +832,7 @@ def test_boxplot_vertical(self): tm.assert_numpy_array_equal(ax.yaxis.get_ticklocs(), positions) assert len(ax.lines) == self.bp_n_objects * len(numeric_cols) + @pytest.mark.slow def test_boxplot_return_type(self): df = DataFrame( np.random.randn(6, 4), @@ -844,6 +854,7 @@ def test_boxplot_return_type(self): result = df.plot.box(return_type="both") self._check_box_return_type(result, "both") + @pytest.mark.slow @td.skip_if_no_scipy def test_kde_df(self): df = DataFrame(np.random.randn(100, 4)) @@ -866,12 +877,14 @@ def test_kde_df(self): axes = df.plot(kind="kde", logy=True, subplots=True) self._check_ax_scales(axes, yaxis="log") + @pytest.mark.slow @td.skip_if_no_scipy def 
test_kde_missing_vals(self): df = DataFrame(np.random.uniform(size=(100, 4))) df.loc[0, 0] = np.nan _check_plot_works(df.plot, kind="kde") + @pytest.mark.slow def test_hist_df(self): from matplotlib.patches import Rectangle @@ -953,6 +966,7 @@ def _check_box_coord( if expected_w is not None: tm.assert_numpy_array_equal(result_width, expected_w, check_dtype=False) + @pytest.mark.slow def test_hist_df_coord(self): normal_df = DataFrame( { @@ -1084,10 +1098,12 @@ def test_hist_df_coord(self): expected_w=np.array([6, 7, 8, 9, 10]), ) + @pytest.mark.slow def test_plot_int_columns(self): df = DataFrame(np.random.randn(100, 4)).cumsum() _check_plot_works(df.plot, legend=True) + @pytest.mark.slow def test_df_legend_labels(self): kinds = ["line", "bar", "barh", "kde", "area", "hist"] df = DataFrame(np.random.rand(3, 3), columns=["a", "b", "c"]) @@ -1201,6 +1217,7 @@ def test_legend_name(self): leg_title = ax.legend_.get_title() self._check_text_labels(leg_title, "new") + @pytest.mark.slow def test_no_legend(self): kinds = ["line", "bar", "barh", "kde", "area", "hist"] df = DataFrame(np.random.rand(3, 3), columns=["a", "b", "c"]) @@ -1209,6 +1226,7 @@ def test_no_legend(self): ax = df.plot(kind=kind, legend=False) self._check_legend_labels(ax, visible=False) + @pytest.mark.slow def test_style_by_column(self): import matplotlib.pyplot as plt @@ -1227,6 +1245,7 @@ def test_style_by_column(self): for idx, line in enumerate(ax.get_lines()[: len(markers)]): assert line.get_marker() == markers[idx] + @pytest.mark.slow def test_line_label_none(self): s = Series([1, 2]) ax = s.plot() @@ -1247,7 +1266,7 @@ def test_line_label_none(self): def test_specified_props_kwd_plot_box(self, props, expected): # GH 30346 df = DataFrame({k: np.random.random(100) for k in "ABC"}) - kwd = {props: {"color": "C1"}} + kwd = {props: dict(color="C1")} result = df.plot.box(return_type="dict", **kwd) assert result[expected][0].get_color() == "C1" @@ -1283,6 +1302,7 @@ def test_all_invalid_plot_data(self): with pytest.raises(TypeError, match=msg): df.plot(kind=kind) + @pytest.mark.slow def test_partially_invalid_plot_data(self): with tm.RNGContext(42): df = DataFrame(np.random.randn(10, 2), dtype=object) @@ -1352,6 +1372,7 @@ def test_xy_args_integer(self, x, y, colnames): df.columns = colnames _check_plot_works(df.plot, x=x, y=y) + @pytest.mark.slow def test_hexbin_basic(self): df = self.hexbin_df @@ -1367,6 +1388,7 @@ def test_hexbin_basic(self): # return value is single axes self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) + @pytest.mark.slow def test_hexbin_with_c(self): df = self.hexbin_df @@ -1376,6 +1398,7 @@ def test_hexbin_with_c(self): ax = df.plot.hexbin(x="A", y="B", C="C", reduce_C_function=np.std) assert len(ax.collections) == 1 + @pytest.mark.slow @pytest.mark.parametrize( "kwargs, expected", [ @@ -1389,6 +1412,7 @@ def test_hexbin_cmap(self, kwargs, expected): ax = df.plot.hexbin(x="A", y="B", **kwargs) assert ax.collections[0].cmap.name == expected + @pytest.mark.slow def test_pie_df(self): df = DataFrame( np.random.rand(5, 3), @@ -1460,6 +1484,7 @@ def test_pie_df_nan(self): expected_labels = base_expected[:i] + base_expected[i + 1 :] assert result_labels == expected_labels + @pytest.mark.slow def test_errorbar_plot(self): d = {"x": np.arange(12), "y": np.arange(12, 0, -1)} df = DataFrame(d) @@ -1506,6 +1531,7 @@ def test_errorbar_plot(self): with pytest.raises((ValueError, TypeError)): df.plot(yerr=df_err) + @pytest.mark.slow @pytest.mark.parametrize("kind", ["line", "bar", "barh"]) def 
test_errorbar_plot_different_kinds(self, kind): d = {"x": np.arange(12), "y": np.arange(12, 0, -1)} @@ -1539,6 +1565,7 @@ def test_errorbar_plot_different_kinds(self, kind): self._check_has_errorbars(axes, xerr=1, yerr=1) @pytest.mark.xfail(reason="Iterator is consumed", raises=ValueError) + @pytest.mark.slow def test_errorbar_plot_iterator(self): with warnings.catch_warnings(): d = {"x": np.arange(12), "y": np.arange(12, 0, -1)} @@ -1548,6 +1575,7 @@ def test_errorbar_plot_iterator(self): ax = _check_plot_works(df.plot, yerr=itertools.repeat(0.1, len(df))) self._check_has_errorbars(ax, xerr=0, yerr=2) + @pytest.mark.slow def test_errorbar_with_integer_column_names(self): # test with integer column names df = DataFrame(np.random.randn(10, 2)) @@ -1557,6 +1585,7 @@ def test_errorbar_with_integer_column_names(self): ax = _check_plot_works(df.plot, y=0, yerr=1) self._check_has_errorbars(ax, xerr=0, yerr=1) + @pytest.mark.slow def test_errorbar_with_partial_columns(self): df = DataFrame(np.random.randn(10, 3)) df_err = DataFrame(np.random.randn(10, 2), columns=[0, 2]) @@ -1579,6 +1608,7 @@ def test_errorbar_with_partial_columns(self): ax = _check_plot_works(df.plot, yerr=err) self._check_has_errorbars(ax, xerr=0, yerr=1) + @pytest.mark.slow @pytest.mark.parametrize("kind", ["line", "bar", "barh"]) def test_errorbar_timeseries(self, kind): d = {"x": np.arange(12), "y": np.arange(12, 0, -1)} @@ -1683,6 +1713,7 @@ def _check_errorbar_color(containers, expected, has_err="has_xerr"): self._check_has_errorbars(ax, xerr=0, yerr=1) _check_errorbar_color(ax.containers, "green", has_err="has_yerr") + @pytest.mark.slow def test_sharex_and_ax(self): # https://github.com/pandas-dev/pandas/issues/9737 using gridspec, # the axis in fig.get_axis() are sorted differently than pandas @@ -1737,6 +1768,7 @@ def _check(axes): self._check_visible(ax.get_xticklabels(minor=True), visible=True) tm.close() + @pytest.mark.slow def test_sharey_and_ax(self): # https://github.com/pandas-dev/pandas/issues/9737 using gridspec, # the axis in fig.get_axis() are sorted differently than pandas @@ -1822,6 +1854,7 @@ def test_memory_leak(self): # need to actually access something to get an error results[key].lines + @pytest.mark.slow def test_df_gridspec_patterns(self): # GH 10819 import matplotlib.gridspec as gridspec @@ -1937,6 +1970,7 @@ def _get_boxed_grid(): self._check_visible(ax.get_xticklabels(minor=True), visible=True) tm.close() + @pytest.mark.slow def test_df_grid_settings(self): # Make sure plot defaults to rcParams['axes.grid'] setting, GH 9792 self._check_grid_settings( @@ -1988,10 +2022,11 @@ def test_secondary_axis_font_size(self, method): fontsize = 20 sy = ["C", "D"] - kwargs = {"secondary_y": sy, "fontsize": fontsize, "mark_right": True} + kwargs = dict(secondary_y=sy, fontsize=fontsize, mark_right=True) ax = getattr(df.plot, method)(**kwargs) self._check_ticks_props(axes=ax.right_ax, ylabelsize=fontsize) + @pytest.mark.slow def test_x_string_values_ticks(self): # Test if string plot index have a fixed xtick position # GH: 7612, GH: 22334 @@ -2011,6 +2046,7 @@ def test_x_string_values_ticks(self): assert labels_position["Tuesday"] == 1.0 assert labels_position["Wednesday"] == 2.0 + @pytest.mark.slow def test_x_multiindex_values_ticks(self): # Test if multiindex plot index have a fixed xtick position # GH: 15912 @@ -2154,6 +2190,7 @@ def test_xlabel_ylabel_dataframe_plane_plot(self, kind, xlabel, ylabel): assert ax.get_xlabel() == (xcol if xlabel is None else xlabel) assert ax.get_ylabel() == 
(ycol if ylabel is None else ylabel) + @pytest.mark.slow @pytest.mark.parametrize("method", ["bar", "barh"]) def test_bar_ticklabel_consistence(self, method): # Draw two consecutiv bar plot with consistent ticklabels diff --git a/pandas/tests/plotting/frame/test_frame_color.py b/pandas/tests/plotting/frame/test_frame_color.py index bc64014cdb6d4..d9fe7363a15ad 100644 --- a/pandas/tests/plotting/frame/test_frame_color.py +++ b/pandas/tests/plotting/frame/test_frame_color.py @@ -12,8 +12,6 @@ import pandas._testing as tm from pandas.tests.plotting.common import TestPlotBase, _check_plot_works -pytestmark = pytest.mark.slow - @td.skip_if_no_mpl class TestDataFrameColor(TestPlotBase): @@ -100,6 +98,7 @@ def test_color_and_marker(self, color, expected): assert all(i.get_linestyle() == "--" for i in ax.lines) assert all(i.get_marker() == "d" for i in ax.lines) + @pytest.mark.slow def test_bar_colors(self): import matplotlib.pyplot as plt @@ -153,6 +152,7 @@ def test_bar_user_colors(self): ] assert result == expected + @pytest.mark.slow def test_if_scatterplot_colorbar_affects_xaxis_visibility(self): # addressing issue #10611, to ensure colobar does not # interfere with x-axis label and ticklabels with @@ -175,6 +175,7 @@ def test_if_scatterplot_colorbar_affects_xaxis_visibility(self): ax1.xaxis.get_label().get_visible() == ax2.xaxis.get_label().get_visible() ) + @pytest.mark.slow def test_if_hexbin_xaxis_label_is_visible(self): # addressing issue #10678, to ensure colobar does not # interfere with x-axis label and ticklabels with @@ -187,6 +188,7 @@ def test_if_hexbin_xaxis_label_is_visible(self): assert all(vis.get_visible() for vis in ax.xaxis.get_majorticklabels()) assert ax.xaxis.get_label().get_visible() + @pytest.mark.slow def test_if_scatterplot_colorbars_are_next_to_parent_axes(self): import matplotlib.pyplot as plt @@ -248,6 +250,7 @@ def test_scatter_colorbar_different_cmap(self): assert ax.collections[0].cmap.name == "cividis" assert ax.collections[1].cmap.name == "magma" + @pytest.mark.slow def test_line_colors(self): from matplotlib import cm @@ -292,11 +295,13 @@ def test_line_colors(self): self._check_colors(ax.get_lines(), linecolors=custom_colors) tm.close() + @pytest.mark.slow def test_dont_modify_colors(self): colors = ["r", "g", "b"] DataFrame(np.random.rand(10, 2)).plot(color=colors) assert len(colors) == 3 + @pytest.mark.slow def test_line_colors_and_styles_subplots(self): # GH 9894 from matplotlib import cm @@ -365,6 +370,7 @@ def test_line_colors_and_styles_subplots(self): self._check_colors(ax.get_lines(), linecolors=[c]) tm.close() + @pytest.mark.slow def test_area_colors(self): from matplotlib import cm from matplotlib.collections import PolyCollection @@ -409,6 +415,7 @@ def test_area_colors(self): for h in handles: assert h.get_alpha() == 0.5 + @pytest.mark.slow def test_hist_colors(self): default_colors = self._unpack_cycler(self.plt.rcParams) @@ -443,6 +450,7 @@ def test_hist_colors(self): self._check_colors(ax.patches[::10], facecolors=["green"] * 5) tm.close() + @pytest.mark.slow @td.skip_if_no_scipy def test_kde_colors(self): from matplotlib import cm @@ -463,6 +471,7 @@ def test_kde_colors(self): rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, len(df))] self._check_colors(ax.get_lines(), linecolors=rgba_colors) + @pytest.mark.slow @td.skip_if_no_scipy def test_kde_colors_and_styles_subplots(self): from matplotlib import cm @@ -519,6 +528,7 @@ def test_kde_colors_and_styles_subplots(self): self._check_colors(ax.get_lines(), linecolors=[c]) tm.close() + 
@pytest.mark.slow def test_boxplot_colors(self): def _check_colors(bp, box_c, whiskers_c, medians_c, caps_c="k", fliers_c=None): # TODO: outside this func? @@ -541,12 +551,9 @@ def _check_colors(bp, box_c, whiskers_c, medians_c, caps_c="k", fliers_c=None): _check_colors(bp, default_colors[0], default_colors[0], default_colors[2]) tm.close() - dict_colors = { - "boxes": "#572923", - "whiskers": "#982042", - "medians": "#804823", - "caps": "#123456", - } + dict_colors = dict( + boxes="#572923", whiskers="#982042", medians="#804823", caps="#123456" + ) bp = df.plot.box(color=dict_colors, sym="r+", return_type="dict") _check_colors( bp, @@ -559,7 +566,7 @@ def _check_colors(bp, box_c, whiskers_c, medians_c, caps_c="k", fliers_c=None): tm.close() # partial colors - dict_colors = {"whiskers": "c", "medians": "m"} + dict_colors = dict(whiskers="c", medians="m") bp = df.plot.box(color=dict_colors, return_type="dict") _check_colors(bp, default_colors[0], "c", "m") tm.close() @@ -587,7 +594,7 @@ def _check_colors(bp, box_c, whiskers_c, medians_c, caps_c="k", fliers_c=None): with pytest.raises(ValueError): # Color contains invalid key results in ValueError - df.plot.box(color={"boxes": "red", "xxxx": "blue"}) + df.plot.box(color=dict(boxes="red", xxxx="blue")) def test_default_color_cycle(self): import cycler @@ -602,11 +609,13 @@ def test_default_color_cycle(self): expected = self._unpack_cycler(plt.rcParams)[:3] self._check_colors(ax.get_lines(), linecolors=expected) + @pytest.mark.slow def test_no_color_bar(self): df = self.hexbin_df ax = df.plot.hexbin(x="A", y="B", colorbar=None) assert ax.collections[0].colorbar is None + @pytest.mark.slow def test_mixing_cmap_and_colormap_raises(self): df = self.hexbin_df msg = "Only specify one of `cmap` and `colormap`" diff --git a/pandas/tests/plotting/frame/test_frame_groupby.py b/pandas/tests/plotting/frame/test_frame_groupby.py index bc35e02e6a581..9c1676d6d97fb 100644 --- a/pandas/tests/plotting/frame/test_frame_groupby.py +++ b/pandas/tests/plotting/frame/test_frame_groupby.py @@ -9,8 +9,6 @@ import pandas._testing as tm from pandas.tests.plotting.common import TestPlotBase -pytestmark = pytest.mark.slow - @td.skip_if_no_mpl class TestDataFramePlotsGroupby(TestPlotBase): diff --git a/pandas/tests/plotting/frame/test_frame_subplots.py b/pandas/tests/plotting/frame/test_frame_subplots.py index 427b2c1c3a180..413c5b8a87dc7 100644 --- a/pandas/tests/plotting/frame/test_frame_subplots.py +++ b/pandas/tests/plotting/frame/test_frame_subplots.py @@ -15,8 +15,6 @@ from pandas.io.formats.printing import pprint_thing -pytestmark = pytest.mark.slow - @td.skip_if_no_mpl class TestDataFramePlotsSubplots(TestPlotBase): @@ -35,6 +33,7 @@ def setup_method(self, method): } ) + @pytest.mark.slow def test_subplots(self): df = DataFrame(np.random.rand(10, 3), index=list(string.ascii_letters[:10])) @@ -73,6 +72,7 @@ def test_subplots(self): for ax in axes: assert ax.get_legend() is None + @pytest.mark.slow def test_subplots_timeseries(self): idx = date_range(start="2014-07-01", freq="M", periods=10) df = DataFrame(np.random.rand(10, 3), index=idx) @@ -190,6 +190,7 @@ def test_subplots_timeseries_y_axis_not_supported(self): == testdata["datetime_mixed_tz"].values ).all() + @pytest.mark.slow def test_subplots_layout_multi_column(self): # GH 6667 df = DataFrame(np.random.rand(10, 3), index=list(string.ascii_letters[:10])) @@ -223,6 +224,7 @@ def test_subplots_layout_multi_column(self): with pytest.raises(ValueError): df.plot(subplots=True, layout=(-1, -1)) + @pytest.mark.slow 
@pytest.mark.parametrize( "kwargs, expected_axes_num, expected_layout, expected_shape", [ @@ -244,6 +246,7 @@ def test_subplots_layout_single_column( ) assert axes.shape == expected_shape + @pytest.mark.slow def test_subplots_warnings(self): # GH 9464 with tm.assert_produces_warning(None): @@ -255,6 +258,7 @@ def test_subplots_warnings(self): ) df.plot(subplots=True, layout=(3, 2)) + @pytest.mark.slow def test_subplots_multiple_axes(self): # GH 5353, 6970, GH 7069 fig, axes = self.plt.subplots(2, 3) @@ -354,6 +358,7 @@ def test_subplots_sharex_axes_existing_axes(self): for ax in axes.ravel(): self._check_visible(ax.get_yticklabels(), visible=True) + @pytest.mark.slow def test_subplots_dup_columns(self): # GH 10962 df = DataFrame(np.random.rand(5, 5), columns=list("aaaaa")) @@ -375,6 +380,7 @@ def test_subplots_dup_columns(self): assert len(ax.lines) == 0 assert len(ax.right_ax.lines) == 5 + @pytest.mark.slow def test_bar_log_no_subplots(self): # GH3254, GH3298 matplotlib/matplotlib#1882, #1892 # regressions in 1.2.1 @@ -385,6 +391,7 @@ def test_bar_log_no_subplots(self): ax = df.plot.bar(grid=True, log=True) tm.assert_numpy_array_equal(ax.yaxis.get_ticklocs(), expected) + @pytest.mark.slow def test_bar_log_subplots(self): expected = np.array([0.1, 1.0, 10.0, 100.0, 1000.0, 1e4]) @@ -395,6 +402,7 @@ def test_bar_log_subplots(self): tm.assert_numpy_array_equal(ax[0].yaxis.get_ticklocs(), expected) tm.assert_numpy_array_equal(ax[1].yaxis.get_ticklocs(), expected) + @pytest.mark.slow def test_boxplot_subplots_return_type(self): df = self.hist_df @@ -414,6 +422,7 @@ def test_boxplot_subplots_return_type(self): check_ax_title=False, ) + @pytest.mark.slow def test_df_subplots_patterns_minorticks(self): # GH 10657 import matplotlib.pyplot as plt @@ -504,37 +513,38 @@ def test_xlabel_ylabel_dataframe_subplots( assert all(ax.get_ylabel() == str(new_label) for ax in axes) assert all(ax.get_xlabel() == str(new_label) for ax in axes) + @pytest.mark.slow @pytest.mark.parametrize( "kwargs", [ # stacked center - {"kind": "bar", "stacked": True}, - {"kind": "bar", "stacked": True, "width": 0.9}, - {"kind": "barh", "stacked": True}, - {"kind": "barh", "stacked": True, "width": 0.9}, + dict(kind="bar", stacked=True), + dict(kind="bar", stacked=True, width=0.9), + dict(kind="barh", stacked=True), + dict(kind="barh", stacked=True, width=0.9), # center - {"kind": "bar", "stacked": False}, - {"kind": "bar", "stacked": False, "width": 0.9}, - {"kind": "barh", "stacked": False}, - {"kind": "barh", "stacked": False, "width": 0.9}, + dict(kind="bar", stacked=False), + dict(kind="bar", stacked=False, width=0.9), + dict(kind="barh", stacked=False), + dict(kind="barh", stacked=False, width=0.9), # subplots center - {"kind": "bar", "subplots": True}, - {"kind": "bar", "subplots": True, "width": 0.9}, - {"kind": "barh", "subplots": True}, - {"kind": "barh", "subplots": True, "width": 0.9}, + dict(kind="bar", subplots=True), + dict(kind="bar", subplots=True, width=0.9), + dict(kind="barh", subplots=True), + dict(kind="barh", subplots=True, width=0.9), # align edge - {"kind": "bar", "stacked": True, "align": "edge"}, - {"kind": "bar", "stacked": True, "width": 0.9, "align": "edge"}, - {"kind": "barh", "stacked": True, "align": "edge"}, - {"kind": "barh", "stacked": True, "width": 0.9, "align": "edge"}, - {"kind": "bar", "stacked": False, "align": "edge"}, - {"kind": "bar", "stacked": False, "width": 0.9, "align": "edge"}, - {"kind": "barh", "stacked": False, "align": "edge"}, - {"kind": "barh", "stacked": False, "width": 
0.9, "align": "edge"}, - {"kind": "bar", "subplots": True, "align": "edge"}, - {"kind": "bar", "subplots": True, "width": 0.9, "align": "edge"}, - {"kind": "barh", "subplots": True, "align": "edge"}, - {"kind": "barh", "subplots": True, "width": 0.9, "align": "edge"}, + dict(kind="bar", stacked=True, align="edge"), + dict(kind="bar", stacked=True, width=0.9, align="edge"), + dict(kind="barh", stacked=True, align="edge"), + dict(kind="barh", stacked=True, width=0.9, align="edge"), + dict(kind="bar", stacked=False, align="edge"), + dict(kind="bar", stacked=False, width=0.9, align="edge"), + dict(kind="barh", stacked=False, align="edge"), + dict(kind="barh", stacked=False, width=0.9, align="edge"), + dict(kind="bar", subplots=True, align="edge"), + dict(kind="bar", subplots=True, width=0.9, align="edge"), + dict(kind="barh", subplots=True, align="edge"), + dict(kind="barh", subplots=True, width=0.9, align="edge"), ], ) def test_bar_align_multiple_columns(self, kwargs): @@ -542,21 +552,23 @@ def test_bar_align_multiple_columns(self, kwargs): df = DataFrame({"A": [3] * 5, "B": list(range(5))}, index=range(5)) self._check_bar_alignment(df, **kwargs) + @pytest.mark.slow @pytest.mark.parametrize( "kwargs", [ - {"kind": "bar", "stacked": False}, - {"kind": "bar", "stacked": True}, - {"kind": "barh", "stacked": False}, - {"kind": "barh", "stacked": True}, - {"kind": "bar", "subplots": True}, - {"kind": "barh", "subplots": True}, + dict(kind="bar", stacked=False), + dict(kind="bar", stacked=True), + dict(kind="barh", stacked=False), + dict(kind="barh", stacked=True), + dict(kind="bar", subplots=True), + dict(kind="barh", subplots=True), ], ) def test_bar_align_single_column(self, kwargs): df = DataFrame(np.random.randn(5)) self._check_bar_alignment(df, **kwargs) + @pytest.mark.slow @pytest.mark.parametrize( "kwargs", [ @@ -572,6 +584,7 @@ def test_bar_barwidth_position(self, kwargs): df = DataFrame(np.random.randn(5, 5)) self._check_bar_alignment(df, width=0.9, position=0.2, **kwargs) + @pytest.mark.slow def test_bar_barwidth_position_int(self): # GH 12979 df = DataFrame(np.random.randn(5, 5)) diff --git a/pandas/tests/plotting/test_backend.py b/pandas/tests/plotting/test_backend.py index 567d159f723a5..9025f8c361a82 100644 --- a/pandas/tests/plotting/test_backend.py +++ b/pandas/tests/plotting/test_backend.py @@ -12,9 +12,6 @@ setattr(dummy_backend, "plot", lambda *args, **kwargs: "used_dummy") -pytestmark = pytest.mark.slow - - @pytest.fixture def restore_backend(): """Restore the plotting backend to matplotlib""" diff --git a/pandas/tests/plotting/test_boxplot_method.py b/pandas/tests/plotting/test_boxplot_method.py index 54a40afd019c3..9e1a8d473b9d6 100644 --- a/pandas/tests/plotting/test_boxplot_method.py +++ b/pandas/tests/plotting/test_boxplot_method.py @@ -1,5 +1,3 @@ -""" Test cases for .boxplot method """ - import itertools import string @@ -14,11 +12,12 @@ import pandas.plotting as plotting -pytestmark = pytest.mark.slow +""" Test cases for .boxplot method """ @td.skip_if_no_mpl class TestDataFramePlots(TestPlotBase): + @pytest.mark.slow def test_boxplot_legacy1(self): df = DataFrame( np.random.randn(6, 4), @@ -43,6 +42,7 @@ def test_boxplot_legacy1(self): with tm.assert_produces_warning(UserWarning): _check_plot_works(df.boxplot, by="indic", notch=1) + @pytest.mark.slow def test_boxplot_legacy2(self): df = DataFrame(np.random.rand(10, 2), columns=["Col1", "Col2"]) df["X"] = Series(["A", "A", "A", "A", "A", "B", "B", "B", "B", "B"]) @@ -77,11 +77,13 @@ def test_boxplot_legacy2(self): lines 
= list(itertools.chain.from_iterable(d.values())) assert len(ax.get_lines()) == len(lines) + @pytest.mark.slow def test_boxplot_return_type_none(self): # GH 12216; return_type=None & by=None -> axes result = self.hist_df.boxplot() assert isinstance(result, self.plt.Axes) + @pytest.mark.slow def test_boxplot_return_type_legacy(self): # API change in https://github.com/pandas-dev/pandas/pull/7096 import matplotlib as mpl # noqa @@ -109,6 +111,7 @@ def test_boxplot_return_type_legacy(self): result = df.boxplot(return_type="both") self._check_box_return_type(result, "both") + @pytest.mark.slow def test_boxplot_axis_limits(self): def _check_ax_limits(col, ax): y_min, y_max = ax.get_ylim() @@ -135,11 +138,13 @@ def _check_ax_limits(col, ax): assert age_ax._sharey == height_ax assert dummy_ax._sharey is None + @pytest.mark.slow def test_boxplot_empty_column(self): df = DataFrame(np.random.randn(20, 4)) df.loc[:, 0] = np.nan _check_plot_works(df.boxplot, return_type="axes") + @pytest.mark.slow def test_figsize(self): df = DataFrame(np.random.rand(10, 5), columns=["A", "B", "C", "D", "E"]) result = df.boxplot(return_type="axes", figsize=(12, 8)) @@ -171,11 +176,11 @@ def test_boxplot_numeric_data(self): "colors_kwd, expected", [ ( - {"boxes": "r", "whiskers": "b", "medians": "g", "caps": "c"}, - {"boxes": "r", "whiskers": "b", "medians": "g", "caps": "c"}, + dict(boxes="r", whiskers="b", medians="g", caps="c"), + dict(boxes="r", whiskers="b", medians="g", caps="c"), ), - ({"boxes": "r"}, {"boxes": "r"}), - ("r", {"boxes": "r", "whiskers": "r", "medians": "r", "caps": "r"}), + (dict(boxes="r"), dict(boxes="r")), + ("r", dict(boxes="r", whiskers="r", medians="r", caps="r")), ], ) def test_color_kwd(self, colors_kwd, expected): @@ -187,7 +192,7 @@ def test_color_kwd(self, colors_kwd, expected): @pytest.mark.parametrize( "dict_colors, msg", - [({"boxes": "r", "invalid_key": "r"}, "invalid key 'invalid_key'")], + [(dict(boxes="r", invalid_key="r"), "invalid key 'invalid_key'")], ) def test_color_kwd_errors(self, dict_colors, msg): # GH: 26214 @@ -207,7 +212,7 @@ def test_color_kwd_errors(self, dict_colors, msg): def test_specified_props_kwd(self, props, expected): # GH 30346 df = DataFrame({k: np.random.random(100) for k in "ABC"}) - kwd = {props: {"color": "C1"}} + kwd = {props: dict(color="C1")} result = df.boxplot(return_type="dict", **kwd) assert result[expected][0].get_color() == "C1" @@ -215,6 +220,7 @@ def test_specified_props_kwd(self, props, expected): @td.skip_if_no_mpl class TestDataFrameGroupByPlots(TestPlotBase): + @pytest.mark.slow def test_boxplot_legacy1(self): grouped = self.hist_df.groupby(by="gender") with tm.assert_produces_warning(UserWarning): @@ -223,6 +229,7 @@ def test_boxplot_legacy1(self): axes = _check_plot_works(grouped.boxplot, subplots=False, return_type="axes") self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) + @pytest.mark.slow def test_boxplot_legacy2(self): tuples = zip(string.ascii_letters[:10], range(10)) df = DataFrame(np.random.rand(10, 3), index=MultiIndex.from_tuples(tuples)) @@ -234,6 +241,7 @@ def test_boxplot_legacy2(self): axes = _check_plot_works(grouped.boxplot, subplots=False, return_type="axes") self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) + @pytest.mark.slow def test_boxplot_legacy3(self): tuples = zip(string.ascii_letters[:10], range(10)) df = DataFrame(np.random.rand(10, 3), index=MultiIndex.from_tuples(tuples)) @@ -244,6 +252,7 @@ def test_boxplot_legacy3(self): axes = _check_plot_works(grouped.boxplot, subplots=False, 
return_type="axes") self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) + @pytest.mark.slow def test_grouped_plot_fignums(self): n = 10 weight = Series(np.random.normal(166, 20, size=n)) @@ -267,6 +276,7 @@ def test_grouped_plot_fignums(self): res = df.groupby("gender").hist() tm.close() + @pytest.mark.slow def test_grouped_box_return_type(self): df = self.hist_df @@ -301,6 +311,7 @@ def test_grouped_box_return_type(self): returned = df2.boxplot(by="category", return_type=t) self._check_box_return_type(returned, t, expected_keys=columns2) + @pytest.mark.slow def test_grouped_box_layout(self): df = self.hist_df @@ -394,6 +405,7 @@ def test_grouped_box_layout(self): ) self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(1, 3)) + @pytest.mark.slow def test_grouped_box_multiple_axes(self): # GH 6970, GH 7069 df = self.hist_df diff --git a/pandas/tests/plotting/test_common.py b/pandas/tests/plotting/test_common.py index 2664dc8e1b090..af67ed7ec215b 100644 --- a/pandas/tests/plotting/test_common.py +++ b/pandas/tests/plotting/test_common.py @@ -5,8 +5,6 @@ from pandas import DataFrame from pandas.tests.plotting.common import TestPlotBase, _check_plot_works -pytestmark = pytest.mark.slow - @td.skip_if_no_mpl class TestCommon(TestPlotBase): diff --git a/pandas/tests/plotting/test_converter.py b/pandas/tests/plotting/test_converter.py index ae14318cdaa49..583ed040c20d5 100644 --- a/pandas/tests/plotting/test_converter.py +++ b/pandas/tests/plotting/test_converter.py @@ -31,9 +31,6 @@ dates = pytest.importorskip("matplotlib.dates") -pytestmark = pytest.mark.slow - - def test_registry_mpl_resets(): # Check that Matplotlib converters are properly reset (see issue #27481) code = ( diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index 397a064f6adad..590758bc01fbb 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -18,8 +18,6 @@ from pandas.tseries.offsets import WeekOfMonth -pytestmark = pytest.mark.slow - @td.skip_if_no_mpl class TestTSPlot(TestPlotBase): @@ -45,6 +43,7 @@ def setup_method(self, method): def teardown_method(self, method): tm.close() + @pytest.mark.slow def test_ts_plot_with_tz(self, tz_aware_fixture): # GH2877, GH17173, GH31205, GH31580 tz = tz_aware_fixture @@ -66,6 +65,7 @@ def test_fontsize_set_correctly(self): for label in ax.get_xticklabels() + ax.get_yticklabels(): assert label.get_fontsize() == 2 + @pytest.mark.slow def test_frame_inferred(self): # inferred freq idx = date_range("1/1/1987", freq="MS", periods=100) @@ -105,6 +105,7 @@ def test_nonnumeric_exclude(self): with pytest.raises(TypeError, match=msg): df["A"].plot() + @pytest.mark.slow def test_tsplot(self): _, ax = self.plt.subplots() @@ -136,6 +137,7 @@ def test_both_style_and_color(self): with pytest.raises(ValueError, match=msg): s.plot(style="b-", color="#000099") + @pytest.mark.slow def test_high_freq(self): freaks = ["ms", "us"] for freq in freaks: @@ -152,6 +154,7 @@ def test_get_datevalue(self): assert get_datevalue(Period(1987, "A"), "M") == Period("1987-12", "M").ordinal assert get_datevalue("1/1/1987", "D") == Period("1987-1-1", "D").ordinal + @pytest.mark.slow def test_ts_plot_format_coord(self): def check_format_of_first_point(ax, expected_string): first_line = ax.get_lines()[0] @@ -176,10 +179,12 @@ def check_format_of_first_point(ax, expected_string): check_format_of_first_point(ax, "t = 2014-01-01 y = 1.000000") tm.close() + @pytest.mark.slow def test_line_plot_period_series(self): 
for s in self.period_ser: _check_plot_works(s.plot, s.index.freq) + @pytest.mark.slow @pytest.mark.parametrize( "frqncy", ["1S", "3S", "5T", "7H", "4D", "8W", "11M", "3A"] ) @@ -190,14 +195,17 @@ def test_line_plot_period_mlt_series(self, frqncy): s = Series(np.random.randn(len(idx)), idx) _check_plot_works(s.plot, s.index.freq.rule_code) + @pytest.mark.slow def test_line_plot_datetime_series(self): for s in self.datetime_ser: _check_plot_works(s.plot, s.index.freq.rule_code) + @pytest.mark.slow def test_line_plot_period_frame(self): for df in self.period_df: _check_plot_works(df.plot, df.index.freq) + @pytest.mark.slow @pytest.mark.parametrize( "frqncy", ["1S", "3S", "5T", "7H", "4D", "8W", "11M", "3A"] ) @@ -210,11 +218,13 @@ def test_line_plot_period_mlt_frame(self, frqncy): freq = df.index.asfreq(df.index.freq.rule_code).freq _check_plot_works(df.plot, freq) + @pytest.mark.slow def test_line_plot_datetime_frame(self): for df in self.datetime_df: freq = df.index.to_period(df.index.freq.rule_code).freq _check_plot_works(df.plot, freq) + @pytest.mark.slow def test_line_plot_inferred_freq(self): for ser in self.datetime_ser: ser = Series(ser.values, Index(np.asarray(ser.index))) @@ -231,6 +241,7 @@ def test_fake_inferred_business(self): ts.plot(ax=ax) assert not hasattr(ax, "freq") + @pytest.mark.slow def test_plot_offset_freq(self): ser = tm.makeTimeSeries() _check_plot_works(ser.plot) @@ -239,11 +250,13 @@ def test_plot_offset_freq(self): ser = Series(np.random.randn(len(dr)), index=dr) _check_plot_works(ser.plot) + @pytest.mark.slow def test_plot_multiple_inferred_freq(self): dr = Index([datetime(2000, 1, 1), datetime(2000, 1, 6), datetime(2000, 1, 11)]) ser = Series(np.random.randn(len(dr)), index=dr) _check_plot_works(ser.plot) + @pytest.mark.slow def test_uhf(self): import pandas.plotting._matplotlib.converter as conv @@ -262,6 +275,7 @@ def test_uhf(self): if len(rs): assert xp == rs + @pytest.mark.slow def test_irreg_hf(self): idx = date_range("2012-6-22 21:59:51", freq="S", periods=100) df = DataFrame(np.random.randn(len(idx), 2), index=idx) @@ -308,6 +322,7 @@ def test_business_freq(self): idx = ax.get_lines()[0].get_xdata() assert PeriodIndex(data=idx).freqstr == "B" + @pytest.mark.slow def test_business_freq_convert(self): bts = tm.makeTimeSeries(300).asfreq("BM") ts = bts.to_period("M") @@ -345,6 +360,7 @@ def test_dataframe(self): idx = ax.get_lines()[0].get_xdata() tm.assert_index_equal(bts.index.to_period(), PeriodIndex(idx)) + @pytest.mark.slow def test_axis_limits(self): def _test(ax): xlim = ax.get_xlim() @@ -395,6 +411,7 @@ def test_get_finder(self): assert conv.get_finder(to_offset("A")) == conv._annual_finder assert conv.get_finder(to_offset("W")) == conv._daily_finder + @pytest.mark.slow def test_finder_daily(self): day_lst = [10, 40, 252, 400, 950, 2750, 10000] @@ -417,6 +434,7 @@ def test_finder_daily(self): assert rs1 == xpl1 assert rs2 == xpl2 + @pytest.mark.slow def test_finder_quarterly(self): yrs = [3.5, 11] @@ -439,6 +457,7 @@ def test_finder_quarterly(self): assert rs1 == xpl1 assert rs2 == xpl2 + @pytest.mark.slow def test_finder_monthly(self): yrs = [1.15, 2.5, 4, 11] @@ -471,6 +490,7 @@ def test_finder_monthly_long(self): xp = Period("1989Q1", "M").ordinal assert rs == xp + @pytest.mark.slow def test_finder_annual(self): xp = [1987, 1988, 1990, 1990, 1995, 2020, 2070, 2170] xp = [Period(x, freq="A").ordinal for x in xp] @@ -486,6 +506,7 @@ def test_finder_annual(self): assert rs == xp + @pytest.mark.slow def test_finder_minutely(self): nminutes = 50 * 24 
* 60 rng = date_range("1/1/1999", freq="Min", periods=nminutes) @@ -510,6 +531,7 @@ def test_finder_hourly(self): assert rs == xp + @pytest.mark.slow def test_gaps(self): ts = tm.makeTimeSeries() ts[5:25] = np.nan @@ -564,6 +586,7 @@ def test_gaps(self): mask = data.mask assert mask[2:5, 1].all() + @pytest.mark.slow def test_gap_upsample(self): low = tm.makeTimeSeries() low[5:25] = np.nan @@ -586,6 +609,7 @@ def test_gap_upsample(self): mask = data.mask assert mask[5:25, 1].all() + @pytest.mark.slow def test_secondary_y(self): ser = Series(np.random.randn(10)) ser2 = Series(np.random.randn(10)) @@ -614,6 +638,7 @@ def test_secondary_y(self): assert hasattr(ax2, "left_ax") assert not hasattr(ax2, "right_ax") + @pytest.mark.slow def test_secondary_y_ts(self): idx = date_range("1/1/2000", periods=10) ser = Series(np.random.randn(10), idx) @@ -639,6 +664,7 @@ def test_secondary_y_ts(self): ax2 = ser.plot(secondary_y=True) assert ax.get_yaxis().get_visible() + @pytest.mark.slow @td.skip_if_no_scipy def test_secondary_kde(self): @@ -650,6 +676,7 @@ def test_secondary_kde(self): axes = fig.get_axes() assert axes[1].get_yaxis().get_ticks_position() == "right" + @pytest.mark.slow def test_secondary_bar(self): ser = Series(np.random.randn(10)) fig, ax = self.plt.subplots() @@ -657,6 +684,7 @@ def test_secondary_bar(self): axes = fig.get_axes() assert axes[1].get_yaxis().get_ticks_position() == "right" + @pytest.mark.slow def test_secondary_frame(self): df = DataFrame(np.random.randn(5, 3), columns=["a", "b", "c"]) axes = df.plot(secondary_y=["a", "c"], subplots=True) @@ -664,6 +692,7 @@ def test_secondary_frame(self): assert axes[1].get_yaxis().get_ticks_position() == self.default_tick_position assert axes[2].get_yaxis().get_ticks_position() == "right" + @pytest.mark.slow def test_secondary_bar_frame(self): df = DataFrame(np.random.randn(5, 3), columns=["a", "b", "c"]) axes = df.plot(kind="bar", secondary_y=["a", "c"], subplots=True) @@ -693,6 +722,7 @@ def test_mixed_freq_regular_first(self): assert left <= pidx[0].ordinal assert right >= pidx[-1].ordinal + @pytest.mark.slow def test_mixed_freq_irregular_first(self): s1 = tm.makeTimeSeries() s2 = s1[[0, 5, 10, 11, 12, 13, 14, 15]] @@ -723,6 +753,7 @@ def test_mixed_freq_regular_first_df(self): assert left <= pidx[0].ordinal assert right >= pidx[-1].ordinal + @pytest.mark.slow def test_mixed_freq_irregular_first_df(self): # GH 9852 s1 = tm.makeTimeSeries().to_frame() @@ -748,6 +779,7 @@ def test_mixed_freq_hf_first(self): for line in ax.get_lines(): assert PeriodIndex(data=line.get_xdata()).freq == "D" + @pytest.mark.slow def test_mixed_freq_alignment(self): ts_ind = date_range("2012-01-01 13:00", "2012-01-02", freq="H") ts_data = np.random.randn(12) @@ -761,6 +793,7 @@ def test_mixed_freq_alignment(self): assert ax.lines[0].get_xdata()[0] == ax.lines[1].get_xdata()[0] + @pytest.mark.slow def test_mixed_freq_lf_first(self): idxh = date_range("1/1/1999", periods=365, freq="D") @@ -840,6 +873,7 @@ def test_nat_handling(self): assert s.index.min() <= Series(xdata).min() assert Series(xdata).max() <= s.index.max() + @pytest.mark.slow def test_to_weekly_resampling(self): idxh = date_range("1/1/1999", periods=52, freq="W") idxl = date_range("1/1/1999", periods=12, freq="M") @@ -851,6 +885,7 @@ def test_to_weekly_resampling(self): for line in ax.get_lines(): assert PeriodIndex(data=line.get_xdata()).freq == idxh.freq + @pytest.mark.slow def test_from_weekly_resampling(self): idxh = date_range("1/1/1999", periods=52, freq="W") idxl = date_range("1/1/1999", 
periods=12, freq="M") @@ -874,6 +909,7 @@ def test_from_weekly_resampling(self): tm.assert_numpy_array_equal(xdata, expected_h) tm.close() + @pytest.mark.slow def test_from_resampling_area_line_mixed(self): idxh = date_range("1/1/1999", periods=52, freq="W") idxl = date_range("1/1/1999", periods=12, freq="M") @@ -965,6 +1001,7 @@ def test_from_resampling_area_line_mixed(self): expected_y += low[i].values tm.assert_numpy_array_equal(lines.get_ydata(orig=False), expected_y) + @pytest.mark.slow def test_mixed_freq_second_millisecond(self): # GH 7772, GH 7760 idxh = date_range("2014-07-01 09:00", freq="S", periods=50) @@ -988,6 +1025,7 @@ def test_mixed_freq_second_millisecond(self): for line in ax.get_lines(): assert PeriodIndex(data=line.get_xdata()).freq == "L" + @pytest.mark.slow def test_irreg_dtypes(self): # date idx = [date(2000, 1, 1), date(2000, 1, 5), date(2000, 1, 20)] @@ -1008,6 +1046,7 @@ def test_irreg_dtypes(self): _, ax = self.plt.subplots() _check_plot_works(df.plot, ax=ax) + @pytest.mark.slow def test_time(self): t = datetime(1, 1, 1, 3, 30, 0) deltas = np.random.randint(1, 20, 3).cumsum() @@ -1032,6 +1071,7 @@ def test_time(self): xp = time(h, m, s).strftime("%H:%M") assert xp == rs + @pytest.mark.slow def test_time_change_xlim(self): t = datetime(1, 1, 1, 3, 30, 0) deltas = np.random.randint(1, 20, 3).cumsum() @@ -1073,6 +1113,7 @@ def test_time_change_xlim(self): xp = time(h, m, s).strftime("%H:%M") assert xp == rs + @pytest.mark.slow def test_time_musec(self): t = datetime(1, 1, 1, 3, 30, 0) deltas = np.random.randint(1, 20, 3).cumsum() @@ -1104,6 +1145,7 @@ def test_time_musec(self): xp = time(h, m, s, us).strftime("%H:%M") assert xp == rs + @pytest.mark.slow def test_secondary_upsample(self): idxh = date_range("1/1/1999", periods=365, freq="D") idxl = date_range("1/1/1999", periods=12, freq="M") @@ -1119,6 +1161,7 @@ def test_secondary_upsample(self): for line in ax.left_ax.get_lines(): assert PeriodIndex(line.get_xdata()).freq == "D" + @pytest.mark.slow def test_secondary_legend(self): fig = self.plt.figure() ax = fig.add_subplot(211) @@ -1220,6 +1263,7 @@ def test_format_date_axis(self): if len(line.get_text()) > 0: assert line.get_rotation() == 30 + @pytest.mark.slow def test_ax_plot(self): x = date_range(start="2012-01-02", periods=10, freq="D") y = list(range(len(x))) @@ -1227,12 +1271,13 @@ def test_ax_plot(self): lines = ax.plot(x, y, label="Y") tm.assert_index_equal(DatetimeIndex(lines[0].get_xdata()), x) + @pytest.mark.slow def test_mpl_nopandas(self): dates = [date(2008, 12, 31), date(2009, 1, 31)] values1 = np.arange(10.0, 11.0, 0.5) values2 = np.arange(11.0, 12.0, 0.5) - kw = {"fmt": "-", "lw": 4} + kw = dict(fmt="-", lw=4) _, ax = self.plt.subplots() ax.plot_date([x.toordinal() for x in dates], values1, **kw) @@ -1245,6 +1290,7 @@ def test_mpl_nopandas(self): exp = np.array([x.toordinal() for x in dates], dtype=np.float64) tm.assert_numpy_array_equal(line2.get_xydata()[:, 0], exp) + @pytest.mark.slow def test_irregular_ts_shared_ax_xlim(self): # GH 2960 from pandas.plotting._matplotlib.converter import DatetimeConverter @@ -1262,6 +1308,7 @@ def test_irregular_ts_shared_ax_xlim(self): assert left <= DatetimeConverter.convert(ts_irregular.index.min(), "", ax) assert right >= DatetimeConverter.convert(ts_irregular.index.max(), "", ax) + @pytest.mark.slow def test_secondary_y_non_ts_xlim(self): # GH 3490 - non-timeseries with secondary y index_1 = [1, 2, 3, 4] @@ -1278,6 +1325,7 @@ def test_secondary_y_non_ts_xlim(self): assert left_before >= left_after assert 
right_before < right_after + @pytest.mark.slow def test_secondary_y_regular_ts_xlim(self): # GH 3490 - regular-timeseries with secondary y index_1 = date_range(start="2000-01-01", periods=4, freq="D") @@ -1294,6 +1342,7 @@ def test_secondary_y_regular_ts_xlim(self): assert left_before >= left_after assert right_before < right_after + @pytest.mark.slow def test_secondary_y_mixed_freq_ts_xlim(self): # GH 3490 - mixed frequency timeseries with secondary y rng = date_range("2000-01-01", periods=10000, freq="min") @@ -1309,6 +1358,7 @@ def test_secondary_y_mixed_freq_ts_xlim(self): assert left_before == left_after assert right_before == right_after + @pytest.mark.slow def test_secondary_y_irregular_ts_xlim(self): # GH 3490 - irregular-timeseries with secondary y from pandas.plotting._matplotlib.converter import DatetimeConverter @@ -1402,6 +1452,7 @@ def test_hist(self): _, ax = self.plt.subplots() ax.hist([x, x], weights=[w1, w2]) + @pytest.mark.slow def test_overlapping_datetime(self): # GB 6608 s1 = Series( diff --git a/pandas/tests/plotting/test_groupby.py b/pandas/tests/plotting/test_groupby.py index f73ceee577a18..7ed29507fe0f4 100644 --- a/pandas/tests/plotting/test_groupby.py +++ b/pandas/tests/plotting/test_groupby.py @@ -11,8 +11,6 @@ import pandas._testing as tm from pandas.tests.plotting.common import TestPlotBase -pytestmark = pytest.mark.slow - @td.skip_if_no_mpl class TestDataFrameGroupByPlots(TestPlotBase): diff --git a/pandas/tests/plotting/test_hist_method.py b/pandas/tests/plotting/test_hist_method.py index f700b2934cd8c..ab0024559333e 100644 --- a/pandas/tests/plotting/test_hist_method.py +++ b/pandas/tests/plotting/test_hist_method.py @@ -9,8 +9,6 @@ import pandas._testing as tm from pandas.tests.plotting.common import TestPlotBase, _check_plot_works -pytestmark = pytest.mark.slow - @td.skip_if_no_mpl class TestSeriesPlots(TestPlotBase): @@ -23,6 +21,7 @@ def setup_method(self, method): self.ts = tm.makeTimeSeries() self.ts.name = "ts" + @pytest.mark.slow def test_hist_legacy(self): _check_plot_works(self.ts.hist) _check_plot_works(self.ts.hist, grid=False) @@ -46,11 +45,13 @@ def test_hist_legacy(self): with pytest.raises(ValueError): self.ts.hist(by=self.ts.index, figure=fig) + @pytest.mark.slow def test_hist_bins_legacy(self): df = DataFrame(np.random.randn(10, 2)) ax = df.hist(bins=2)[0][0] assert len(ax.patches) == 2 + @pytest.mark.slow def test_hist_layout(self): df = self.hist_df with pytest.raises(ValueError): @@ -59,6 +60,7 @@ def test_hist_layout(self): with pytest.raises(ValueError): df.height.hist(layout=[1, 1]) + @pytest.mark.slow def test_hist_layout_with_by(self): df = self.hist_df @@ -96,6 +98,7 @@ def test_hist_layout_with_by(self): axes = df.height.hist(by=df.category, layout=(4, 2), figsize=(12, 7)) self._check_axes_shape(axes, axes_num=4, layout=(4, 2), figsize=(12, 7)) + @pytest.mark.slow def test_hist_no_overlap(self): from matplotlib.pyplot import gcf, subplot @@ -109,11 +112,13 @@ def test_hist_no_overlap(self): axes = fig.axes assert len(axes) == 2 + @pytest.mark.slow def test_hist_by_no_extra_plots(self): df = self.hist_df axes = df.height.hist(by=df.gender) # noqa assert len(self.plt.get_fignums()) == 1 + @pytest.mark.slow def test_plot_fails_when_ax_differs_from_figure(self): from pylab import figure @@ -165,6 +170,7 @@ def test_hist_with_legend_raises(self, by): @td.skip_if_no_mpl class TestDataFramePlots(TestPlotBase): + @pytest.mark.slow def test_hist_df_legacy(self): from matplotlib.patches import Rectangle @@ -250,6 +256,7 @@ def 
test_hist_df_legacy(self): with pytest.raises(AttributeError): ser.hist(foo="bar") + @pytest.mark.slow def test_hist_non_numerical_or_datetime_raises(self): # gh-10444, GH32590 df = DataFrame( @@ -275,6 +282,7 @@ def test_hist_non_numerical_or_datetime_raises(self): with pytest.raises(ValueError, match=msg): df_o.hist() + @pytest.mark.slow def test_hist_layout(self): df = DataFrame(np.random.randn(100, 2)) df[2] = to_datetime( @@ -313,6 +321,7 @@ def test_hist_layout(self): with pytest.raises(ValueError): df.hist(layout=(-1, -1)) + @pytest.mark.slow # GH 9351 def test_tight_layout(self): df = DataFrame(np.random.randn(100, 2)) @@ -435,6 +444,7 @@ def test_hist_with_legend_raises(self, by, column): @td.skip_if_no_mpl class TestDataFrameGroupByPlots(TestPlotBase): + @pytest.mark.slow def test_grouped_hist_legacy(self): from matplotlib.patches import Rectangle @@ -504,6 +514,7 @@ def test_grouped_hist_legacy(self): with pytest.raises(ValueError, match=msg): df.hist(by="C", figsize="default") + @pytest.mark.slow def test_grouped_hist_legacy2(self): n = 10 weight = Series(np.random.normal(166, 20, size=n)) @@ -517,6 +528,7 @@ def test_grouped_hist_legacy2(self): assert len(self.plt.get_fignums()) == 2 tm.close() + @pytest.mark.slow def test_grouped_hist_layout(self): df = self.hist_df msg = "Layout of 1x1 must be larger than required size 2" @@ -571,6 +583,7 @@ def test_grouped_hist_layout(self): axes = df.hist(column=["height", "weight", "category"]) self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) + @pytest.mark.slow def test_grouped_hist_multiple_axes(self): # GH 6970, GH 7069 df = self.hist_df @@ -590,6 +603,7 @@ def test_grouped_hist_multiple_axes(self): # pass different number of axes from required axes = df.hist(column="height", ax=axes) + @pytest.mark.slow def test_axis_share_x(self): df = self.hist_df # GH4089 @@ -603,6 +617,7 @@ def test_axis_share_x(self): assert not ax1._shared_y_axes.joined(ax1, ax2) assert not ax2._shared_y_axes.joined(ax1, ax2) + @pytest.mark.slow def test_axis_share_y(self): df = self.hist_df ax1, ax2 = df.hist(column="height", by=df.gender, sharey=True) @@ -615,6 +630,7 @@ def test_axis_share_y(self): assert not ax1._shared_x_axes.joined(ax1, ax2) assert not ax2._shared_x_axes.joined(ax1, ax2) + @pytest.mark.slow def test_axis_share_xy(self): df = self.hist_df ax1, ax2 = df.hist(column="height", by=df.gender, sharex=True, sharey=True) diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py index 1208100ed2dce..f37d83cd0783e 100644 --- a/pandas/tests/plotting/test_misc.py +++ b/pandas/tests/plotting/test_misc.py @@ -11,8 +11,6 @@ import pandas.plotting as plotting -pytestmark = pytest.mark.slow - @td.skip_if_mpl def test_import_error_message(): @@ -68,6 +66,7 @@ def setup_method(self, method): self.ts = tm.makeTimeSeries() self.ts.name = "ts" + @pytest.mark.slow def test_autocorrelation_plot(self): from pandas.plotting import autocorrelation_plot @@ -77,12 +76,14 @@ def test_autocorrelation_plot(self): ax = autocorrelation_plot(self.ts, label="Test") self._check_legend_labels(ax, labels=["Test"]) + @pytest.mark.slow def test_lag_plot(self): from pandas.plotting import lag_plot _check_plot_works(lag_plot, series=self.ts) _check_plot_works(lag_plot, series=self.ts, lag=5) + @pytest.mark.slow def test_bootstrap_plot(self): from pandas.plotting import bootstrap_plot @@ -126,6 +127,7 @@ def test_scatter_matrix_axis(self): self._check_text_labels(axes0_labels, expected) self._check_ticks_props(axes, xlabelsize=8, xrot=90, 
ylabelsize=8, yrot=0) + @pytest.mark.slow def test_andrews_curves(self, iris): from matplotlib import cm @@ -201,6 +203,7 @@ def test_andrews_curves(self, iris): handles, labels = ax.get_legend_handles_labels() self._check_colors(handles, linecolors=colors) + @pytest.mark.slow def test_parallel_coordinates(self, iris): from matplotlib import cm @@ -274,6 +277,7 @@ def test_parallel_coordinates_with_sorted_labels(self): # labels and colors are ordered strictly increasing assert prev[1] < nxt[1] and prev[0] < nxt[0] + @pytest.mark.slow def test_radviz(self, iris): from matplotlib import cm @@ -306,6 +310,7 @@ def test_radviz(self, iris): handles, labels = ax.get_legend_handles_labels() self._check_colors(handles, facecolors=colors) + @pytest.mark.slow def test_subplot_titles(self, iris): df = iris.drop("Name", axis=1).head() # Use the column names as the subplot titles @@ -406,6 +411,7 @@ def test_get_standard_colors_no_appending(self): p = df.A.plot.bar(figsize=(16, 7), color=color_list) assert p.patches[1].get_facecolor() == p.patches[17].get_facecolor() + @pytest.mark.slow def test_dictionary_color(self): # issue-8193 # Test plot color dictionary format @@ -426,6 +432,7 @@ def test_dictionary_color(self): colors = [rect.get_color() for rect in ax.get_lines()[0:2]] assert all(color == expected[index] for index, color in enumerate(colors)) + @pytest.mark.slow def test_has_externally_shared_axis_x_axis(self): # GH33819 # Test _has_externally_shared_axis() works for x-axis @@ -451,6 +458,7 @@ def test_has_externally_shared_axis_x_axis(self): assert func(plots[0][2], "x") assert not func(plots[0][3], "x") + @pytest.mark.slow def test_has_externally_shared_axis_y_axis(self): # GH33819 # Test _has_externally_shared_axis() works for y-axis @@ -476,6 +484,7 @@ def test_has_externally_shared_axis_y_axis(self): assert func(plots[2][0], "y") assert not func(plots[3][0], "y") + @pytest.mark.slow def test_has_externally_shared_axis_invalid_compare_axis(self): # GH33819 # Test _has_externally_shared_axis() raises an exception when @@ -493,6 +502,7 @@ def test_has_externally_shared_axis_invalid_compare_axis(self): with pytest.raises(ValueError, match=msg): func(plots[0][0], "z") + @pytest.mark.slow def test_externally_shared_axes(self): # Example from GH33819 # Create data diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 9da2336fb9342..b8dd2ada87506 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -16,8 +16,6 @@ import pandas.plotting as plotting -pytestmark = pytest.mark.slow - @td.skip_if_no_mpl class TestSeriesPlots(TestPlotBase): @@ -36,6 +34,7 @@ def setup_method(self, method): self.iseries = tm.makePeriodSeries() self.iseries.name = "iseries" + @pytest.mark.slow def test_plot(self): _check_plot_works(self.ts.plot, label="foo") _check_plot_works(self.ts.plot, use_index=False) @@ -71,6 +70,7 @@ def test_plot(self): ax = _check_plot_works(self.ts.plot, subplots=True, layout=(1, -1)) self._check_axes_shape(ax, axes_num=1, layout=(1, 1)) + @pytest.mark.slow def test_plot_figsize_and_title(self): # figsize and title _, ax = self.plt.subplots() @@ -222,6 +222,7 @@ def test_line_use_index_false(self): label2 = ax2.get_xlabel() assert label2 == "" + @pytest.mark.slow def test_bar_log(self): expected = np.array([1e-1, 1e0, 1e1, 1e2, 1e3, 1e4]) @@ -255,6 +256,7 @@ def test_bar_log(self): tm.assert_almost_equal(res[1], ymax) tm.assert_numpy_array_equal(ax.xaxis.get_ticklocs(), expected) + @pytest.mark.slow def 
test_bar_ignore_index(self): df = Series([1, 2, 3, 4], index=["a", "b", "c", "d"]) _, ax = self.plt.subplots() @@ -309,6 +311,7 @@ def test_unsorted_index_xlim(self): assert xmin <= np.nanmin(lines[0].get_data(orig=False)[0]) assert xmax >= np.nanmax(lines[0].get_data(orig=False)[0]) + @pytest.mark.slow def test_pie_series(self): # if sum of values is less than 1.0, pie handle them as rate and draw # semicircle. @@ -365,12 +368,14 @@ def test_pie_nan(self): result = [x.get_text() for x in ax.texts] assert result == expected + @pytest.mark.slow def test_hist_df_kwargs(self): df = DataFrame(np.random.randn(10, 2)) _, ax = self.plt.subplots() ax = df.plot.hist(bins=5, ax=ax) assert len(ax.patches) == 10 + @pytest.mark.slow def test_hist_df_with_nonnumerics(self): # GH 9853 with tm.RNGContext(1): @@ -384,6 +389,7 @@ def test_hist_df_with_nonnumerics(self): ax = df.plot.hist(ax=ax) # bins=10 assert len(ax.patches) == 40 + @pytest.mark.slow def test_hist_legacy(self): _check_plot_works(self.ts.hist) _check_plot_works(self.ts.hist, grid=False) @@ -407,11 +413,13 @@ def test_hist_legacy(self): with pytest.raises(ValueError): self.ts.hist(by=self.ts.index, figure=fig) + @pytest.mark.slow def test_hist_bins_legacy(self): df = DataFrame(np.random.randn(10, 2)) ax = df.hist(bins=2)[0][0] assert len(ax.patches) == 2 + @pytest.mark.slow def test_hist_layout(self): df = self.hist_df with pytest.raises(ValueError): @@ -420,6 +428,7 @@ def test_hist_layout(self): with pytest.raises(ValueError): df.height.hist(layout=[1, 1]) + @pytest.mark.slow def test_hist_layout_with_by(self): df = self.hist_df @@ -455,6 +464,7 @@ def test_hist_layout_with_by(self): axes = df.height.hist(by=df.category, layout=(4, 2), figsize=(12, 7)) self._check_axes_shape(axes, axes_num=4, layout=(4, 2), figsize=(12, 7)) + @pytest.mark.slow def test_hist_no_overlap(self): from matplotlib.pyplot import gcf, subplot @@ -468,6 +478,7 @@ def test_hist_no_overlap(self): axes = fig.axes assert len(axes) == 2 + @pytest.mark.slow def test_hist_secondary_legend(self): # GH 9610 df = DataFrame(np.random.randn(30, 4), columns=list("abcd")) @@ -506,6 +517,7 @@ def test_hist_secondary_legend(self): assert ax.get_yaxis().get_visible() tm.close() + @pytest.mark.slow def test_df_series_secondary_legend(self): # GH 9779 df = DataFrame(np.random.randn(30, 3), columns=list("abc")) @@ -569,6 +581,7 @@ def test_df_series_secondary_legend(self): assert ax.get_yaxis().get_visible() tm.close() + @pytest.mark.slow @pytest.mark.parametrize( "input_logy, expected_scale", [(True, "log"), ("sym", "symlog")] ) @@ -584,12 +597,14 @@ def test_secondary_logy(self, input_logy, expected_scale): assert ax1.get_yscale() == expected_scale assert ax2.get_yscale() == expected_scale + @pytest.mark.slow def test_plot_fails_with_dupe_color_and_style(self): x = Series(np.random.randn(2)) with pytest.raises(ValueError): _, ax = self.plt.subplots() x.plot(style="k--", color="k", ax=ax) + @pytest.mark.slow @td.skip_if_no_scipy def test_hist_kde(self): @@ -612,6 +627,7 @@ def test_hist_kde(self): ylabels = ax.get_yticklabels() self._check_text_labels(ylabels, [""] * len(ylabels)) + @pytest.mark.slow @td.skip_if_no_scipy def test_kde_kwargs(self): sample_points = np.linspace(-100, 100, 20) @@ -625,6 +641,7 @@ def test_kde_kwargs(self): self._check_ax_scales(ax, yaxis="log") self._check_text_labels(ax.yaxis.get_label(), "Density") + @pytest.mark.slow @td.skip_if_no_scipy def test_kde_missing_vals(self): s = Series(np.random.uniform(size=50)) @@ -634,6 +651,7 @@ def 
test_kde_missing_vals(self): # gh-14821: check if the values have any missing values assert any(~np.isnan(axes.lines[0].get_xdata())) + @pytest.mark.slow def test_hist_kwargs(self): _, ax = self.plt.subplots() ax = self.ts.plot.hist(bins=5, ax=ax) @@ -650,6 +668,7 @@ def test_hist_kwargs(self): ax = self.ts.plot.hist(align="left", stacked=True, ax=ax) tm.close() + @pytest.mark.slow @td.skip_if_no_scipy def test_hist_kde_color(self): _, ax = self.plt.subplots() @@ -665,6 +684,7 @@ def test_hist_kde_color(self): assert len(lines) == 1 self._check_colors(lines, ["r"]) + @pytest.mark.slow def test_boxplot_series(self): _, ax = self.plt.subplots() ax = self.ts.plot.box(logy=True, ax=ax) @@ -674,6 +694,7 @@ def test_boxplot_series(self): ylabels = ax.get_yticklabels() self._check_text_labels(ylabels, [""] * len(ylabels)) + @pytest.mark.slow def test_kind_both_ways(self): s = Series(range(3)) kinds = ( @@ -687,6 +708,7 @@ def test_kind_both_ways(self): getattr(s.plot, kind)() self.plt.close() + @pytest.mark.slow def test_invalid_plot_data(self): s = Series(list("abcd")) _, ax = self.plt.subplots() @@ -696,6 +718,7 @@ def test_invalid_plot_data(self): with pytest.raises(TypeError, match=msg): s.plot(kind=kind, ax=ax) + @pytest.mark.slow def test_valid_object_plot(self): s = Series(range(10), dtype=object) for kind in plotting.PlotAccessor._common_kinds: @@ -715,6 +738,7 @@ def test_invalid_kind(self): with pytest.raises(ValueError): s.plot(kind="aasdf") + @pytest.mark.slow def test_dup_datetime_index_plot(self): dr1 = date_range("1/1/2009", periods=4) dr2 = date_range("1/2/2009", periods=4) @@ -743,6 +767,7 @@ def test_errorbar_asymmetrical(self): tm.close() + @pytest.mark.slow def test_errorbar_plot(self): s = Series(np.arange(10), name="x") @@ -788,6 +813,7 @@ def test_table(self): _check_plot_works(self.series.plot, table=True) _check_plot_works(self.series.plot, table=self.series) + @pytest.mark.slow def test_series_grid_settings(self): # Make sure plot defaults to rcParams['axes.grid'] setting, GH 9792 self._check_grid_settings( @@ -795,6 +821,7 @@ def test_series_grid_settings(self): plotting.PlotAccessor._series_kinds + plotting.PlotAccessor._common_kinds, ) + @pytest.mark.slow def test_standard_colors(self): from pandas.plotting._matplotlib.style import get_standard_colors @@ -811,6 +838,7 @@ def test_standard_colors(self): result = get_standard_colors(3, color=[c]) assert result == [c] * 3 + @pytest.mark.slow def test_standard_colors_all(self): import matplotlib.colors as colors diff --git a/pandas/tests/plotting/test_style.py b/pandas/tests/plotting/test_style.py index 3c48eeaccbf34..665bda15724fd 100644 --- a/pandas/tests/plotting/test_style.py +++ b/pandas/tests/plotting/test_style.py @@ -5,8 +5,6 @@ pytest.importorskip("matplotlib") from pandas.plotting._matplotlib.style import get_standard_colors -pytestmark = pytest.mark.slow - class TestGetStandardColors: @pytest.mark.parametrize( diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index daa9ac531d556..3c9fab2d4090c 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -1273,7 +1273,7 @@ def test_resample_timegrouper(): dates3 = [pd.NaT] + dates1 + [pd.NaT] for dates in [dates1, dates2, dates3]: - df = DataFrame({"A": dates, "B": np.arange(len(dates))}) + df = DataFrame(dict(A=dates, B=np.arange(len(dates)))) result = df.set_index("A").resample("M").count() exp_idx = DatetimeIndex( ["2014-07-31", "2014-08-31", 
"2014-09-30", "2014-10-31", "2014-11-30"], @@ -1288,9 +1288,7 @@ def test_resample_timegrouper(): result = df.groupby(Grouper(freq="M", key="A")).count() tm.assert_frame_equal(result, expected) - df = DataFrame( - {"A": dates, "B": np.arange(len(dates)), "C": np.arange(len(dates))} - ) + df = DataFrame(dict(A=dates, B=np.arange(len(dates)), C=np.arange(len(dates)))) result = df.set_index("A").resample("M").count() expected = DataFrame( {"B": [1, 0, 2, 2, 1], "C": [1, 0, 2, 2, 1]}, @@ -1730,7 +1728,7 @@ def test_resample_apply_product(): index = date_range(start="2012-01-31", freq="M", periods=12) ts = Series(range(12), index=index) - df = DataFrame({"A": ts, "B": ts + 2}) + df = DataFrame(dict(A=ts, B=ts + 2)) result = df.resample("Q").apply(np.product) expected = DataFrame( np.array([[0, 24], [60, 210], [336, 720], [990, 1716]], dtype=np.int64), diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index c12111e20a4b1..50e7cf9bd8eda 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -158,12 +158,12 @@ def test_aggregate_normal(resample_method): @pytest.mark.parametrize( "method, method_args, unit", [ - ("sum", {}, 0), - ("sum", {"min_count": 0}, 0), - ("sum", {"min_count": 1}, np.nan), - ("prod", {}, 1), - ("prod", {"min_count": 0}, 1), - ("prod", {"min_count": 1}, np.nan), + ("sum", dict(), 0), + ("sum", dict(min_count=0), 0), + ("sum", dict(min_count=1), np.nan), + ("prod", dict(), 1), + ("prod", dict(min_count=0), 1), + ("prod", dict(min_count=1), np.nan), ], ) def test_resample_entirely_nat_window(method, method_args, unit): @@ -267,14 +267,14 @@ def test_repr(): @pytest.mark.parametrize( "method, method_args, expected_values", [ - ("sum", {}, [1, 0, 1]), - ("sum", {"min_count": 0}, [1, 0, 1]), - ("sum", {"min_count": 1}, [1, np.nan, 1]), - ("sum", {"min_count": 2}, [np.nan, np.nan, np.nan]), - ("prod", {}, [1, 1, 1]), - ("prod", {"min_count": 0}, [1, 1, 1]), - ("prod", {"min_count": 1}, [1, np.nan, 1]), - ("prod", {"min_count": 2}, [np.nan, np.nan, np.nan]), + ("sum", dict(), [1, 0, 1]), + ("sum", dict(min_count=0), [1, 0, 1]), + ("sum", dict(min_count=1), [1, np.nan, 1]), + ("sum", dict(min_count=2), [np.nan, np.nan, np.nan]), + ("prod", dict(), [1, 1, 1]), + ("prod", dict(min_count=0), [1, 1, 1]), + ("prod", dict(min_count=1), [1, np.nan, 1]), + ("prod", dict(min_count=2), [np.nan, np.nan, np.nan]), ], ) def test_upsample_sum(method, method_args, expected_values): diff --git a/pandas/tests/reshape/concat/test_categorical.py b/pandas/tests/reshape/concat/test_categorical.py index 6dae28003d3b6..388575c5a3b86 100644 --- a/pandas/tests/reshape/concat/test_categorical.py +++ b/pandas/tests/reshape/concat/test_categorical.py @@ -1,4 +1,5 @@ import numpy as np +import pytest from pandas.core.dtypes.dtypes import CategoricalDtype @@ -136,18 +137,13 @@ def test_categorical_index_preserver(self): ).set_index("B") tm.assert_frame_equal(result, expected) - # wrong categories -> uses concat_compat, which casts to object + # wrong categories df3 = DataFrame( {"A": a, "B": Categorical(b, categories=list("abe"))} ).set_index("B") - result = pd.concat([df2, df3]) - expected = pd.concat( - [ - df2.set_axis(df2.index.astype(object), 0), - df3.set_axis(df3.index.astype(object), 0), - ] - ) - tm.assert_frame_equal(result, expected) + msg = "categories must match existing categories when appending" + with pytest.raises(TypeError, match=msg): + pd.concat([df2, df3]) def 
test_concat_categorical_tz(self): # GH-23816 diff --git a/pandas/tests/reshape/concat/test_datetimes.py b/pandas/tests/reshape/concat/test_datetimes.py index 44a5e7f806309..a4d6b58307523 100644 --- a/pandas/tests/reshape/concat/test_datetimes.py +++ b/pandas/tests/reshape/concat/test_datetimes.py @@ -373,10 +373,10 @@ def test_concat_tz_series_with_datetimelike(self): def test_concat_tz_frame(self): df2 = DataFrame( - { - "A": Timestamp("20130102", tz="US/Eastern"), - "B": Timestamp("20130603", tz="CET"), - }, + dict( + A=Timestamp("20130102", tz="US/Eastern"), + B=Timestamp("20130603", tz="CET"), + ), index=range(5), ) @@ -391,20 +391,20 @@ def test_concat_multiple_tzs(self): ts2 = Timestamp("2015-01-01", tz="UTC") ts3 = Timestamp("2015-01-01", tz="EST") - df1 = DataFrame({"time": [ts1]}) - df2 = DataFrame({"time": [ts2]}) - df3 = DataFrame({"time": [ts3]}) + df1 = DataFrame(dict(time=[ts1])) + df2 = DataFrame(dict(time=[ts2])) + df3 = DataFrame(dict(time=[ts3])) results = pd.concat([df1, df2]).reset_index(drop=True) - expected = DataFrame({"time": [ts1, ts2]}, dtype=object) + expected = DataFrame(dict(time=[ts1, ts2]), dtype=object) tm.assert_frame_equal(results, expected) results = pd.concat([df1, df3]).reset_index(drop=True) - expected = DataFrame({"time": [ts1, ts3]}, dtype=object) + expected = DataFrame(dict(time=[ts1, ts3]), dtype=object) tm.assert_frame_equal(results, expected) results = pd.concat([df2, df3]).reset_index(drop=True) - expected = DataFrame({"time": [ts2, ts3]}) + expected = DataFrame(dict(time=[ts2, ts3])) tm.assert_frame_equal(results, expected) def test_concat_multiindex_with_tz(self): diff --git a/pandas/tests/reshape/concat/test_empty.py b/pandas/tests/reshape/concat/test_empty.py index a97e9265b4f99..5c540124de8e6 100644 --- a/pandas/tests/reshape/concat/test_empty.py +++ b/pandas/tests/reshape/concat/test_empty.py @@ -26,7 +26,7 @@ def test_handle_empty_objects(self, sort): # empty as first element with time series # GH3259 df = DataFrame( - {"A": range(10000)}, index=date_range("20130101", periods=10000, freq="s") + dict(A=range(10000)), index=date_range("20130101", periods=10000, freq="s") ) empty = DataFrame() result = concat([df, empty], axis=1) diff --git a/pandas/tests/reshape/concat/test_invalid.py b/pandas/tests/reshape/concat/test_invalid.py index cc9f09c16fb43..3a886e0d612c6 100644 --- a/pandas/tests/reshape/concat/test_invalid.py +++ b/pandas/tests/reshape/concat/test_invalid.py @@ -12,7 +12,7 @@ def test_concat_invalid(self): # trying to concat a ndframe with a non-ndframe df1 = tm.makeCustomDataframe(10, 2) - for obj in [1, {}, [1, 2], (1, 2)]: + for obj in [1, dict(), [1, 2], (1, 2)]: msg = ( f"cannot concatenate object of type '{type(obj)}'; " @@ -45,7 +45,7 @@ def test_concat_invalid_first_argument(self): bar2,12,13,14,15 """ - with read_csv(StringIO(data), chunksize=1) as reader: - result = concat(reader, ignore_index=True) + reader = read_csv(StringIO(data), chunksize=1) + result = concat(reader, ignore_index=True) expected = read_csv(StringIO(data)) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index f43ae58fbcc2f..f44909b61ff7a 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -422,10 +422,10 @@ def test_left_merge_empty_dataframe(self): @pytest.mark.parametrize( "kwarg", [ - {"left_index": True, "right_index": True}, - {"left_index": True, "right_on": "x"}, - {"left_on": "a", "right_index": True}, 
- {"left_on": "a", "right_on": "x"}, + dict(left_index=True, right_index=True), + dict(left_index=True, right_on="x"), + dict(left_on="a", right_index=True), + dict(left_on="a", right_on="x"), ], ) def test_merge_left_empty_right_empty(self, join_type, kwarg): @@ -475,18 +475,18 @@ def check2(exp, kwarg): tm.assert_frame_equal(result, exp) for kwarg in [ - {"left_index": True, "right_index": True}, - {"left_index": True, "right_on": "x"}, + dict(left_index=True, right_index=True), + dict(left_index=True, right_on="x"), ]: check1(exp_in, kwarg) check2(exp_out, kwarg) - kwarg = {"left_on": "a", "right_index": True} + kwarg = dict(left_on="a", right_index=True) check1(exp_in, kwarg) exp_out["a"] = [0, 1, 2] check2(exp_out, kwarg) - kwarg = {"left_on": "a", "right_on": "x"} + kwarg = dict(left_on="a", right_on="x") check1(exp_in, kwarg) exp_out["a"] = np.array([np.nan] * 3, dtype=object) check2(exp_out, kwarg) @@ -524,10 +524,10 @@ def check2(exp, kwarg): tm.assert_frame_equal(result, exp) for kwarg in [ - {"left_index": True, "right_index": True}, - {"left_index": True, "right_on": "x"}, - {"left_on": "a", "right_index": True}, - {"left_on": "a", "right_on": "x"}, + dict(left_index=True, right_index=True), + dict(left_index=True, right_on="x"), + dict(left_on="a", right_index=True), + dict(left_on="a", right_on="x"), ]: check1(exp_in, kwarg) check2(exp_out, kwarg) @@ -753,7 +753,7 @@ def test_overlapping_columns_error_message(self): # #2649, #10639 df2.columns = ["key1", "foo", "foo"] - msg = r"Data columns not unique: Index\(\['foo'\], dtype='object'\)" + msg = r"Data columns not unique: Index\(\['foo', 'foo'\], dtype='object'\)" with pytest.raises(MergeError, match=msg): merge(df, df2) @@ -1999,19 +1999,19 @@ def test_merge_series(on, left_on, right_on, left_index, right_index, nm): @pytest.mark.parametrize( "col1, col2, kwargs, expected_cols", [ - (0, 0, {"suffixes": ("", "_dup")}, ["0", "0_dup"]), - (0, 0, {"suffixes": (None, "_dup")}, [0, "0_dup"]), - (0, 0, {"suffixes": ("_x", "_y")}, ["0_x", "0_y"]), - (0, 0, {"suffixes": ["_x", "_y"]}, ["0_x", "0_y"]), - ("a", 0, {"suffixes": (None, "_y")}, ["a", 0]), - (0.0, 0.0, {"suffixes": ("_x", None)}, ["0.0_x", 0.0]), - ("b", "b", {"suffixes": (None, "_y")}, ["b", "b_y"]), - ("a", "a", {"suffixes": ("_x", None)}, ["a_x", "a"]), - ("a", "b", {"suffixes": ("_x", None)}, ["a", "b"]), - ("a", "a", {"suffixes": (None, "_x")}, ["a", "a_x"]), - (0, 0, {"suffixes": ("_a", None)}, ["0_a", 0]), - ("a", "a", {}, ["a_x", "a_y"]), - (0, 0, {}, ["0_x", "0_y"]), + (0, 0, dict(suffixes=("", "_dup")), ["0", "0_dup"]), + (0, 0, dict(suffixes=(None, "_dup")), [0, "0_dup"]), + (0, 0, dict(suffixes=("_x", "_y")), ["0_x", "0_y"]), + (0, 0, dict(suffixes=["_x", "_y"]), ["0_x", "0_y"]), + ("a", 0, dict(suffixes=(None, "_y")), ["a", 0]), + (0.0, 0.0, dict(suffixes=("_x", None)), ["0.0_x", 0.0]), + ("b", "b", dict(suffixes=(None, "_y")), ["b", "b_y"]), + ("a", "a", dict(suffixes=("_x", None)), ["a_x", "a"]), + ("a", "b", dict(suffixes=("_x", None)), ["a", "b"]), + ("a", "a", dict(suffixes=(None, "_x")), ["a", "a_x"]), + (0, 0, dict(suffixes=("_a", None)), ["0_a", 0]), + ("a", "a", dict(), ["a_x", "a_y"]), + (0, 0, dict(), ["0_x", "0_y"]), ], ) def test_merge_suffix(col1, col2, kwargs, expected_cols): diff --git a/pandas/tests/reshape/merge/test_merge_index_as_string.py b/pandas/tests/reshape/merge/test_merge_index_as_string.py index c3e0a92850c07..d20d93370ec7e 100644 --- a/pandas/tests/reshape/merge/test_merge_index_as_string.py +++ 
b/pandas/tests/reshape/merge/test_merge_index_as_string.py @@ -8,22 +8,22 @@ @pytest.fixture def df1(): return DataFrame( - { - "outer": [1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4], - "inner": [1, 2, 3, 1, 2, 3, 4, 1, 2, 1, 2], - "v1": np.linspace(0, 1, 11), - } + dict( + outer=[1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4], + inner=[1, 2, 3, 1, 2, 3, 4, 1, 2, 1, 2], + v1=np.linspace(0, 1, 11), + ) ) @pytest.fixture def df2(): return DataFrame( - { - "outer": [1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3], - "inner": [1, 2, 2, 3, 3, 4, 2, 3, 1, 1, 2, 3], - "v2": np.linspace(10, 11, 12), - } + dict( + outer=[1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3], + inner=[1, 2, 2, 3, 3, 4, 2, 3, 1, 1, 2, 3], + v2=np.linspace(10, 11, 12), + ) ) diff --git a/pandas/tests/reshape/merge/test_merge_ordered.py b/pandas/tests/reshape/merge/test_merge_ordered.py index 4a70719df5c57..17f2f44f45fce 100644 --- a/pandas/tests/reshape/merge/test_merge_ordered.py +++ b/pandas/tests/reshape/merge/test_merge_ordered.py @@ -115,84 +115,3 @@ def test_doc_example(self): ) tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( - "left, right, on, left_by, right_by, expected", - [ - ( - DataFrame({"G": ["g", "g"], "H": ["h", "h"], "T": [1, 3]}), - DataFrame({"T": [2], "E": [1]}), - ["T"], - ["G", "H"], - None, - DataFrame( - { - "G": ["g"] * 3, - "H": ["h"] * 3, - "T": [1, 2, 3], - "E": [np.nan, 1.0, np.nan], - } - ), - ), - ( - DataFrame({"G": ["g", "g"], "H": ["h", "h"], "T": [1, 3]}), - DataFrame({"T": [2], "E": [1]}), - "T", - ["G", "H"], - None, - DataFrame( - { - "G": ["g"] * 3, - "H": ["h"] * 3, - "T": [1, 2, 3], - "E": [np.nan, 1.0, np.nan], - } - ), - ), - ( - DataFrame({"T": [2], "E": [1]}), - DataFrame({"G": ["g", "g"], "H": ["h", "h"], "T": [1, 3]}), - ["T"], - None, - ["G", "H"], - DataFrame( - { - "T": [1, 2, 3], - "E": [np.nan, 1.0, np.nan], - "G": ["g"] * 3, - "H": ["h"] * 3, - } - ), - ), - ], - ) - def test_list_type_by(self, left, right, on, left_by, right_by, expected): - # GH 35269 - result = merge_ordered( - left=left, - right=right, - on=on, - left_by=left_by, - right_by=right_by, - ) - - tm.assert_frame_equal(result, expected) - - def test_left_by_length_equals_to_right_shape0(self): - # GH 38166 - left = DataFrame([["g", "h", 1], ["g", "h", 3]], columns=list("GHT")) - right = DataFrame([[2, 1]], columns=list("TE")) - result = merge_ordered(left, right, on="T", left_by=["G", "H"]) - expected = DataFrame( - {"G": ["g"] * 3, "H": ["h"] * 3, "T": [1, 2, 3], "E": [np.nan, 1.0, np.nan]} - ) - - tm.assert_frame_equal(result, expected) - - def test_elements_not_in_by_but_in_df(self): - # GH 38167 - left = DataFrame([["g", "h", 1], ["g", "h", 3]], columns=list("GHT")) - right = DataFrame([[2, 1]], columns=list("TE")) - msg = r"\{'h'\} not found in left columns" - with pytest.raises(KeyError, match=msg): - merge_ordered(left, right, on="T", left_by=["G", "h"]) diff --git a/pandas/tests/reshape/merge/test_multi.py b/pandas/tests/reshape/merge/test_multi.py index 673c97740594f..260a0e9d486b2 100644 --- a/pandas/tests/reshape/merge/test_multi.py +++ b/pandas/tests/reshape/merge/test_multi.py @@ -36,13 +36,13 @@ def right(): @pytest.fixture def left_multi(): return DataFrame( - { - "Origin": ["A", "A", "B", "B", "C"], - "Destination": ["A", "B", "A", "C", "A"], - "Period": ["AM", "AM", "IP", "AM", "OP"], - "TripPurp": ["hbw", "nhb", "hbo", "nhb", "hbw"], - "Trips": [1987, 3647, 2470, 4296, 4444], - }, + dict( + Origin=["A", "A", "B", "B", "C"], + Destination=["A", "B", "A", "C", "A"], + Period=["AM", "AM", "IP", "AM", "OP"], + 
TripPurp=["hbw", "nhb", "hbo", "nhb", "hbw"], + Trips=[1987, 3647, 2470, 4296, 4444], + ), columns=["Origin", "Destination", "Period", "TripPurp", "Trips"], ).set_index(["Origin", "Destination", "Period", "TripPurp"]) @@ -50,13 +50,13 @@ def left_multi(): @pytest.fixture def right_multi(): return DataFrame( - { - "Origin": ["A", "A", "B", "B", "C", "C", "E"], - "Destination": ["A", "B", "A", "B", "A", "B", "F"], - "Period": ["AM", "AM", "IP", "AM", "OP", "IP", "AM"], - "LinkType": ["a", "b", "c", "b", "a", "b", "a"], - "Distance": [100, 80, 90, 80, 75, 35, 55], - }, + dict( + Origin=["A", "A", "B", "B", "C", "C", "E"], + Destination=["A", "B", "A", "B", "A", "B", "F"], + Period=["AM", "AM", "IP", "AM", "OP", "IP", "AM"], + LinkType=["a", "b", "c", "b", "a", "b", "a"], + Distance=[100, 80, 90, 80, 75, 35, 55], + ), columns=["Origin", "Destination", "Period", "LinkType", "Distance"], ).set_index(["Origin", "Destination", "Period", "LinkType"]) @@ -533,17 +533,17 @@ def test_join_multi_levels(self): # GH 3662 # merge multi-levels household = DataFrame( - { - "household_id": [1, 2, 3], - "male": [0, 1, 0], - "wealth": [196087.3, 316478.7, 294750], - }, + dict( + household_id=[1, 2, 3], + male=[0, 1, 0], + wealth=[196087.3, 316478.7, 294750], + ), columns=["household_id", "male", "wealth"], ).set_index("household_id") portfolio = DataFrame( - { - "household_id": [1, 2, 2, 3, 3, 3, 4], - "asset_id": [ + dict( + household_id=[1, 2, 2, 3, 3, 3, 4], + asset_id=[ "nl0000301109", "nl0000289783", "gb00b03mlx29", @@ -552,7 +552,7 @@ def test_join_multi_levels(self): "nl0000289965", np.nan, ], - "name": [ + name=[ "ABN Amro", "Robeco", "Royal Dutch Shell", @@ -561,24 +561,17 @@ def test_join_multi_levels(self): "Postbank BioTech Fonds", np.nan, ], - "share": [1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0], - }, + share=[1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0], + ), columns=["household_id", "asset_id", "name", "share"], ).set_index(["household_id", "asset_id"]) result = household.join(portfolio, how="inner") expected = ( DataFrame( - { - "male": [0, 1, 1, 0, 0, 0], - "wealth": [ - 196087.3, - 316478.7, - 316478.7, - 294750.0, - 294750.0, - 294750.0, - ], - "name": [ + dict( + male=[0, 1, 1, 0, 0, 0], + wealth=[196087.3, 316478.7, 316478.7, 294750.0, 294750.0, 294750.0], + name=[ "ABN Amro", "Robeco", "Royal Dutch Shell", @@ -586,9 +579,9 @@ def test_join_multi_levels(self): "AAB Eastern Europe Equity Fund", "Postbank BioTech Fonds", ], - "share": [1.00, 0.40, 0.60, 0.15, 0.60, 0.25], - "household_id": [1, 2, 2, 3, 3, 3], - "asset_id": [ + share=[1.00, 0.40, 0.60, 0.15, 0.60, 0.25], + household_id=[1, 2, 2, 3, 3, 3], + asset_id=[ "nl0000301109", "nl0000289783", "gb00b03mlx29", @@ -596,7 +589,7 @@ def test_join_multi_levels(self): "lu0197800237", "nl0000289965", ], - } + ) ) .set_index(["household_id", "asset_id"]) .reindex(columns=["male", "wealth", "name", "share"]) @@ -618,7 +611,7 @@ def test_join_multi_levels(self): expected, ( DataFrame( - {"share": [1.00]}, + dict(share=[1.00]), index=MultiIndex.from_tuples( [(4, np.nan)], names=["household_id", "asset_id"] ), @@ -649,9 +642,9 @@ def test_join_multi_levels2(self): # some more advanced merges # GH6360 household = DataFrame( - { - "household_id": [1, 2, 2, 3, 3, 3, 4], - "asset_id": [ + dict( + household_id=[1, 2, 2, 3, 3, 3, 4], + asset_id=[ "nl0000301109", "nl0000301109", "gb00b03mlx29", @@ -660,36 +653,30 @@ def test_join_multi_levels2(self): "nl0000289965", np.nan, ], - "share": [1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0], - }, + share=[1.0, 0.4, 0.6, 0.15, 0.6, 
0.25, 1.0], + ), columns=["household_id", "asset_id", "share"], ).set_index(["household_id", "asset_id"]) log_return = DataFrame( - { - "asset_id": [ + dict( + asset_id=[ "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", "lu0197800237", "lu0197800237", ], - "t": [233, 234, 235, 180, 181], - "log_return": [ - 0.09604978, - -0.06524096, - 0.03532373, - 0.03025441, - 0.036997, - ], - } + t=[233, 234, 235, 180, 181], + log_return=[0.09604978, -0.06524096, 0.03532373, 0.03025441, 0.036997], + ) ).set_index(["asset_id", "t"]) expected = ( DataFrame( - { - "household_id": [2, 2, 2, 3, 3, 3, 3, 3], - "asset_id": [ + dict( + household_id=[2, 2, 2, 3, 3, 3, 3, 3], + asset_id=[ "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", @@ -699,9 +686,9 @@ def test_join_multi_levels2(self): "lu0197800237", "lu0197800237", ], - "t": [233, 234, 235, 233, 234, 235, 180, 181], - "share": [0.6, 0.6, 0.6, 0.15, 0.15, 0.15, 0.6, 0.6], - "log_return": [ + t=[233, 234, 235, 233, 234, 235, 180, 181], + share=[0.6, 0.6, 0.6, 0.15, 0.15, 0.15, 0.6, 0.6], + log_return=[ 0.09604978, -0.06524096, 0.03532373, @@ -711,7 +698,7 @@ def test_join_multi_levels2(self): 0.03025441, 0.036997, ], - } + ) ) .set_index(["household_id", "asset_id", "t"]) .reindex(columns=["share", "log_return"]) @@ -728,9 +715,9 @@ def test_join_multi_levels2(self): expected = ( DataFrame( - { - "household_id": [1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4], - "asset_id": [ + dict( + household_id=[1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4], + asset_id=[ "nl0000301109", "nl0000301109", "gb00b03mlx29", @@ -744,21 +731,8 @@ def test_join_multi_levels2(self): "nl0000289965", None, ], - "t": [ - None, - None, - 233, - 234, - 235, - 233, - 234, - 235, - 180, - 181, - None, - None, - ], - "share": [ + t=[None, None, 233, 234, 235, 233, 234, 235, 180, 181, None, None], + share=[ 1.0, 0.4, 0.6, @@ -772,7 +746,7 @@ def test_join_multi_levels2(self): 0.25, 1.0, ], - "log_return": [ + log_return=[ None, None, 0.09604978, @@ -786,7 +760,7 @@ def test_join_multi_levels2(self): None, None, ], - } + ) ) .set_index(["household_id", "asset_id", "t"]) .reindex(columns=["share", "log_return"]) diff --git a/pandas/tests/reshape/test_crosstab.py b/pandas/tests/reshape/test_crosstab.py index 6faf64789c687..5f6037276b31c 100644 --- a/pandas/tests/reshape/test_crosstab.py +++ b/pandas/tests/reshape/test_crosstab.py @@ -535,32 +535,15 @@ def test_crosstab_with_numpy_size(self): ) tm.assert_frame_equal(result, expected) - def test_crosstab_duplicate_names(self): - # GH 13279 / 22529 - - s1 = Series(range(3), name="foo") - s2_foo = Series(range(1, 4), name="foo") - s2_bar = Series(range(1, 4), name="bar") - s3 = Series(range(3), name="waldo") - - # check result computed with duplicate labels against - # result computed with unique labels, then relabelled - mapper = {"bar": "foo"} - - # duplicate row, column labels - result = crosstab(s1, s2_foo) - expected = crosstab(s1, s2_bar).rename_axis(columns=mapper, axis=1) - tm.assert_frame_equal(result, expected) - - # duplicate row, unique column labels - result = crosstab([s1, s2_foo], s3) - expected = crosstab([s1, s2_bar], s3).rename_axis(index=mapper, axis=0) - tm.assert_frame_equal(result, expected) - - # unique row, duplicate column labels - result = crosstab(s3, [s1, s2_foo]) - expected = crosstab(s3, [s1, s2_bar]).rename_axis(columns=mapper, axis=1) + def test_crosstab_dup_index_names(self): + # GH 13279 + s = Series(range(3), name="foo") + result = crosstab(s, s) + expected_index = Index(range(3), name="foo") + expected = DataFrame( + np.eye(3, 
dtype=np.int64), index=expected_index, columns=expected_index + ) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("names", [["a", ("b", "c")], [("a", "b"), "c"]]) diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index 4786b8c35a5b1..8aa4012b3e77c 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -377,10 +377,10 @@ def test_series_ret_bins(): @pytest.mark.parametrize( "kwargs,msg", [ - ({"duplicates": "drop"}, None), - ({}, "Bin edges must be unique"), - ({"duplicates": "raise"}, "Bin edges must be unique"), - ({"duplicates": "foo"}, "invalid value for 'duplicates' parameter"), + (dict(duplicates="drop"), None), + (dict(), "Bin edges must be unique"), + (dict(duplicates="raise"), "Bin edges must be unique"), + (dict(duplicates="foo"), "invalid value for 'duplicates' parameter"), ], ) def test_cut_duplicates_bin(kwargs, msg): diff --git a/pandas/tests/reshape/test_qcut.py b/pandas/tests/reshape/test_qcut.py index e7a04bafed8e3..c436ab5d90578 100644 --- a/pandas/tests/reshape/test_qcut.py +++ b/pandas/tests/reshape/test_qcut.py @@ -166,10 +166,10 @@ def test_qcut_list_like_labels(labels, expected): @pytest.mark.parametrize( "kwargs,msg", [ - ({"duplicates": "drop"}, None), - ({}, "Bin edges must be unique"), - ({"duplicates": "raise"}, "Bin edges must be unique"), - ({"duplicates": "foo"}, "invalid value for 'duplicates' parameter"), + (dict(duplicates="drop"), None), + (dict(), "Bin edges must be unique"), + (dict(duplicates="raise"), "Bin edges must be unique"), + (dict(duplicates="foo"), "invalid value for 'duplicates' parameter"), ], ) def test_qcut_duplicates_bin(kwargs, msg): diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index 9b87e32510b41..bce42f8c6caf0 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -487,22 +487,6 @@ def test_period_cons_combined(self): with pytest.raises(ValueError, match=msg): Period("2011-01", freq="1D1W") - @pytest.mark.parametrize("day", ["1970/01/01 ", "2020-12-31 ", "1981/09/13 "]) - @pytest.mark.parametrize("hour", ["00:00:00", "00:00:01", "23:59:59", "12:00:59"]) - @pytest.mark.parametrize( - "sec_float, expected", - [ - (".000000001", 1), - (".000000999", 999), - (".123456789", 789), - (".999999999", 999), - ], - ) - def test_period_constructor_nanosecond(self, day, hour, sec_float, expected): - # GH 34621 - - assert Period(day + hour + sec_float).start_time.nanosecond == expected - @pytest.mark.parametrize("hour", range(24)) def test_period_large_ordinal(self, hour): # Issue #36430 diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py index a15ef11f9c292..71ddf72562f36 100644 --- a/pandas/tests/series/indexing/test_datetime.py +++ b/pandas/tests/series/indexing/test_datetime.py @@ -561,7 +561,7 @@ def test_indexing(): expected = ts["2001"] expected.name = "A" - df = DataFrame({"A": ts}) + df = DataFrame(dict(A=ts)) with tm.assert_produces_warning(FutureWarning): # GH#36179 string indexing on rows for DataFrame deprecated result = df["2001"]["A"] diff --git a/pandas/tests/series/indexing/test_getitem.py b/pandas/tests/series/indexing/test_getitem.py index b4c30cb6d4cd2..3686337141420 100644 --- a/pandas/tests/series/indexing/test_getitem.py +++ b/pandas/tests/series/indexing/test_getitem.py @@ -389,22 +389,10 @@ def test_getitem_generator(string_series): tm.assert_series_equal(result2, expected) 
-@pytest.mark.parametrize( - "series", - [ - Series([0, 1]), - Series(date_range("2012-01-01", periods=2)), - Series(date_range("2012-01-01", periods=2, tz="CET")), - ], -) -def test_getitem_ndim_deprecated(series): - with tm.assert_produces_warning( - FutureWarning, match="Support for multi-dimensional indexing" - ): - result = series[:, None] - - expected = np.asarray(series)[:, None] - tm.assert_numpy_array_equal(result, expected) +def test_getitem_ndim_deprecated(): + s = Series([0, 1]) + with tm.assert_produces_warning(FutureWarning): + s[:, None] def test_getitem_multilevel_scalar_slice_not_implemented( diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index 159b42621f970..682c057f05700 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -309,7 +309,8 @@ def test_loc_setitem_2d_to_1d_raises(): msg = "|".join( [ - r"shape mismatch: value array of shape \(2,2\)", + r"shape mismatch: value array of shape \(2,2\) could not be " + r"broadcast to indexing result of shape \(2,\)", r"cannot reshape array of size 4 into shape \(2,\)", ] ) @@ -666,9 +667,7 @@ def test_underlying_data_conversion(): df df["val"].update(s) - expected = DataFrame( - {"a": [1, 2, 3], "b": [1, 2, 3], "c": [1, 2, 3], "val": [0, 1, 0]} - ) + expected = DataFrame(dict(a=[1, 2, 3], b=[1, 2, 3], c=[1, 2, 3], val=[0, 1, 0])) return_value = expected.set_index(["a", "b", "c"], inplace=True) assert return_value is None tm.assert_frame_equal(df, expected) @@ -691,11 +690,11 @@ def test_underlying_data_conversion(): pd.set_option("chained_assignment", "raise") # GH 3217 - df = DataFrame({"a": [1, 3], "b": [np.nan, 2]}) + df = DataFrame(dict(a=[1, 3], b=[np.nan, 2])) df["c"] = np.nan df["c"].update(Series(["foo"], index=[0])) - expected = DataFrame({"a": [1, 3], "b": [np.nan, 2], "c": ["foo", np.nan]}) + expected = DataFrame(dict(a=[1, 3], b=[np.nan, 2], c=["foo", np.nan])) tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/series/indexing/test_xs.py b/pandas/tests/series/indexing/test_xs.py index 83cc6d4670423..ca7ed50ab8875 100644 --- a/pandas/tests/series/indexing/test_xs.py +++ b/pandas/tests/series/indexing/test_xs.py @@ -56,8 +56,8 @@ def test_series_xs_droplevel_false(self): mi = MultiIndex.from_tuples( [("a", "x"), ("a", "y"), ("b", "x")], names=["level1", "level2"] ) - ser = Series([1, 1, 1], index=mi) - result = ser.xs("a", axis=0, drop_level=False) + df = Series([1, 1, 1], index=mi) + result = df.xs("a", axis=0, drop_level=False) expected = Series( [1, 1], index=MultiIndex.from_tuples( diff --git a/pandas/tests/series/methods/test_convert.py b/pandas/tests/series/methods/test_convert.py index f052f4423d32a..b213e4a6c4c8a 100644 --- a/pandas/tests/series/methods/test_convert.py +++ b/pandas/tests/series/methods/test_convert.py @@ -3,23 +3,45 @@ import numpy as np import pytest -from pandas import Series, Timestamp +from pandas import NaT, Series, Timestamp import pandas._testing as tm class TestConvert: def test_convert(self): # GH#10265 + # Tests: All to nans, coerce, true + # Test coercion returns correct type + ser = Series(["a", "b", "c"]) + results = ser._convert(datetime=True, coerce=True) + expected = Series([NaT] * 3) + tm.assert_series_equal(results, expected) + + results = ser._convert(numeric=True, coerce=True) + expected = Series([np.nan] * 3) + tm.assert_series_equal(results, expected) + + expected = Series([NaT] * 3, dtype=np.dtype("m8[ns]")) + results = 
ser._convert(timedelta=True, coerce=True) + tm.assert_series_equal(results, expected) + dt = datetime(2001, 1, 1, 0, 0) td = dt - datetime(2000, 1, 1, 0, 0) # Test coercion with mixed types ser = Series(["a", "3.1415", dt, td]) + results = ser._convert(datetime=True, coerce=True) + expected = Series([NaT, NaT, dt, NaT]) + tm.assert_series_equal(results, expected) - results = ser._convert(numeric=True) + results = ser._convert(numeric=True, coerce=True) expected = Series([np.nan, 3.1415, np.nan, np.nan]) tm.assert_series_equal(results, expected) + results = ser._convert(timedelta=True, coerce=True) + expected = Series([NaT, NaT, NaT, td], dtype=np.dtype("m8[ns]")) + tm.assert_series_equal(results, expected) + # Test standard conversion returns original results = ser._convert(datetime=True) tm.assert_series_equal(results, ser) @@ -94,6 +116,19 @@ def test_convert(self): datetime(2001, 1, 3, 0, 0), ] ) + s2 = Series( + [ + datetime(2001, 1, 1, 0, 0), + datetime(2001, 1, 2, 0, 0), + datetime(2001, 1, 3, 0, 0), + "foo", + 1.0, + 1, + Timestamp("20010104"), + "20010105", + ], + dtype="O", + ) result = ser._convert(datetime=True) expected = Series( @@ -102,12 +137,35 @@ def test_convert(self): ) tm.assert_series_equal(result, expected) - result = ser._convert(datetime=True) + result = ser._convert(datetime=True, coerce=True) + tm.assert_series_equal(result, expected) + + expected = Series( + [ + Timestamp("20010101"), + Timestamp("20010102"), + Timestamp("20010103"), + NaT, + NaT, + NaT, + Timestamp("20010104"), + Timestamp("20010105"), + ], + dtype="M8[ns]", + ) + result = s2._convert(datetime=True, numeric=False, timedelta=False, coerce=True) + tm.assert_series_equal(result, expected) + result = s2._convert(datetime=True, coerce=True) + tm.assert_series_equal(result, expected) + + ser = Series(["foo", "bar", 1, 1.0], dtype="O") + result = ser._convert(datetime=True, coerce=True) + expected = Series([NaT] * 2 + [Timestamp(1)] * 2) tm.assert_series_equal(result, expected) # preserver if non-object ser = Series([1], dtype="float32") - result = ser._convert(datetime=True) + result = ser._convert(datetime=True, coerce=True) tm.assert_series_equal(result, ser) # FIXME: dont leave commented-out @@ -116,6 +174,16 @@ def test_convert(self): # result = res._convert(convert_dates=True,convert_numeric=False) # assert result.dtype == 'M8[ns]' + # dateutil parses some single letters into today's value as a date + expected = Series([NaT]) + for x in "abcdefghijklmnopqrstuvwxyz": + ser = Series([x]) + result = ser._convert(datetime=True, coerce=True) + tm.assert_series_equal(result, expected) + ser = Series([x.upper()]) + result = ser._convert(datetime=True, coerce=True) + tm.assert_series_equal(result, expected) + def test_convert_no_arg_error(self): ser = Series(["1.0", "2"]) msg = r"At least one of datetime, numeric or timedelta must be True\." diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index 920182a99e9ef..8a915324a72c1 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -8,186 +8,272 @@ import pandas as pd import pandas._testing as tm -# Each test case consists of a tuple with the data and dtype to create the -# test Series, the default dtype for the expected result (which is valid -# for most cases), and the specific cases where the result deviates from -# this default. Those overrides are defined as a dict with (keyword, val) as -# dictionary key. 
In case of multiple items, the last override takes precendence. -test_cases = [ - ( - # data - [1, 2, 3], - # original dtype - np.dtype("int32"), - # default expected dtype - "Int32", - # exceptions on expected dtype - {("convert_integer", False): np.dtype("int32")}, - ), - ( - [1, 2, 3], - np.dtype("int64"), - "Int64", - {("convert_integer", False): np.dtype("int64")}, - ), - ( - ["x", "y", "z"], - np.dtype("O"), - pd.StringDtype(), - {("convert_string", False): np.dtype("O")}, - ), - ( - [True, False, np.nan], - np.dtype("O"), - pd.BooleanDtype(), - {("convert_boolean", False): np.dtype("O")}, - ), - ( - ["h", "i", np.nan], - np.dtype("O"), - pd.StringDtype(), - {("convert_string", False): np.dtype("O")}, - ), - ( # GH32117 - ["h", "i", 1], - np.dtype("O"), - np.dtype("O"), - {}, - ), - ( - [10, np.nan, 20], - np.dtype("float"), - "Int64", - { - ("convert_integer", False, "convert_floating", True): "Float64", - ("convert_integer", False, "convert_floating", False): np.dtype("float"), - }, - ), - ( - [np.nan, 100.5, 200], - np.dtype("float"), - "Float64", - {("convert_floating", False): np.dtype("float")}, - ), - ( - [3, 4, 5], - "Int8", - "Int8", - {}, - ), - ( - [[1, 2], [3, 4], [5]], - None, - np.dtype("O"), - {}, - ), - ( - [4, 5, 6], - np.dtype("uint32"), - "UInt32", - {("convert_integer", False): np.dtype("uint32")}, - ), - ( - [-10, 12, 13], - np.dtype("i1"), - "Int8", - {("convert_integer", False): np.dtype("i1")}, - ), - ( - [1.2, 1.3], - np.dtype("float32"), - "Float32", - {("convert_floating", False): np.dtype("float32")}, - ), - ( - [1, 2.0], - object, - "Int64", - { - ("convert_integer", False): "Float64", - ("convert_integer", False, "convert_floating", False): np.dtype("float"), - ("infer_objects", False): np.dtype("object"), - }, - ), - ( - [1, 2.5], - object, - "Float64", - { - ("convert_floating", False): np.dtype("float"), - ("infer_objects", False): np.dtype("object"), - }, - ), - (["a", "b"], pd.CategoricalDtype(), pd.CategoricalDtype(), {}), - ( - pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), - pd.DatetimeTZDtype(tz="UTC"), - pd.DatetimeTZDtype(tz="UTC"), - {}, - ), - ( - pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), - "datetime64[ns]", - np.dtype("datetime64[ns]"), - {}, - ), - ( - pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), - object, - np.dtype("datetime64[ns]"), - {("infer_objects", False): np.dtype("object")}, - ), - (pd.period_range("1/1/2011", freq="M", periods=3), None, pd.PeriodDtype("M"), {}), - ( - pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)]), - None, - pd.IntervalDtype("int64"), - {}, - ), -] - class TestSeriesConvertDtypes: + # The answerdict has keys that have 4 tuples, corresponding to the arguments + # infer_objects, convert_string, convert_integer, convert_boolean + # This allows all 16 possible combinations to be tested. 
Since common + # combinations expect the same answer, this provides an easy way to list + # all the possibilities @pytest.mark.parametrize( - "data, maindtype, expected_default, expected_other", - test_cases, + "data, maindtype, answerdict", + [ + ( + [1, 2, 3], + np.dtype("int32"), + { + ((True, False), (True, False), (True,), (True, False)): "Int32", + ((True, False), (True, False), (False,), (True, False)): np.dtype( + "int32" + ), + }, + ), + ( + [1, 2, 3], + np.dtype("int64"), + { + ((True, False), (True, False), (True,), (True, False)): "Int64", + ((True, False), (True, False), (False,), (True, False)): np.dtype( + "int64" + ), + }, + ), + ( + ["x", "y", "z"], + np.dtype("O"), + { + ( + (True, False), + (True,), + (True, False), + (True, False), + ): pd.StringDtype(), + ((True, False), (False,), (True, False), (True, False)): np.dtype( + "O" + ), + }, + ), + ( + [True, False, np.nan], + np.dtype("O"), + { + ( + (True, False), + (True, False), + (True, False), + (True,), + ): pd.BooleanDtype(), + ((True, False), (True, False), (True, False), (False,)): np.dtype( + "O" + ), + }, + ), + ( + ["h", "i", np.nan], + np.dtype("O"), + { + ( + (True, False), + (True,), + (True, False), + (True, False), + ): pd.StringDtype(), + ((True, False), (False,), (True, False), (True, False)): np.dtype( + "O" + ), + }, + ), + ( # GH32117 + ["h", "i", 1], + np.dtype("O"), + { + ( + (True, False), + (True, False), + (True, False), + (True, False), + ): np.dtype("O"), + }, + ), + ( + [10, np.nan, 20], + np.dtype("float"), + { + ((True, False), (True, False), (True,), (True, False)): "Int64", + ((True, False), (True, False), (False,), (True, False)): np.dtype( + "float" + ), + }, + ), + ( + [np.nan, 100.5, 200], + np.dtype("float"), + { + ( + (True, False), + (True, False), + (True, False), + (True, False), + ): np.dtype("float"), + }, + ), + ( + [3, 4, 5], + "Int8", + {((True, False), (True, False), (True, False), (True, False)): "Int8"}, + ), + ( + [[1, 2], [3, 4], [5]], + None, + { + ( + (True, False), + (True, False), + (True, False), + (True, False), + ): np.dtype("O"), + }, + ), + ( + [4, 5, 6], + np.dtype("uint32"), + { + ((True, False), (True, False), (True,), (True, False)): "UInt32", + ((True, False), (True, False), (False,), (True, False)): np.dtype( + "uint32" + ), + }, + ), + ( + [-10, 12, 13], + np.dtype("i1"), + { + ((True, False), (True, False), (True,), (True, False)): "Int8", + ((True, False), (True, False), (False,), (True, False)): np.dtype( + "i1" + ), + }, + ), + ( + [1, 2.0], + object, + { + ((True,), (True, False), (True,), (True, False)): "Int64", + ((True,), (True, False), (False,), (True, False)): np.dtype( + "float" + ), + ((False,), (True, False), (True, False), (True, False)): np.dtype( + "object" + ), + }, + ), + ( + [1, 2.5], + object, + { + ((True,), (True, False), (True, False), (True, False)): np.dtype( + "float" + ), + ((False,), (True, False), (True, False), (True, False)): np.dtype( + "object" + ), + }, + ), + ( + ["a", "b"], + pd.CategoricalDtype(), + { + ( + (True, False), + (True, False), + (True, False), + (True, False), + ): pd.CategoricalDtype(), + }, + ), + ( + pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), + pd.DatetimeTZDtype(tz="UTC"), + { + ( + (True, False), + (True, False), + (True, False), + (True, False), + ): pd.DatetimeTZDtype(tz="UTC"), + }, + ), + ( + pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), + "datetime64[ns]", + { + ( + (True, False), + (True, False), + (True, False), + (True, False), + ): np.dtype("datetime64[ns]"), + }, + 
), + ( + pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), + object, + { + ((True,), (True, False), (True, False), (True, False)): np.dtype( + "datetime64[ns]" + ), + ((False,), (True, False), (True, False), (True, False)): np.dtype( + "O" + ), + }, + ), + ( + pd.period_range("1/1/2011", freq="M", periods=3), + None, + { + ( + (True, False), + (True, False), + (True, False), + (True, False), + ): pd.PeriodDtype("M"), + }, + ), + ( + pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)]), + None, + { + ( + (True, False), + (True, False), + (True, False), + (True, False), + ): pd.IntervalDtype("int64"), + }, + ), + ], ) - @pytest.mark.parametrize("params", product(*[(True, False)] * 5)) - def test_convert_dtypes( - self, data, maindtype, params, expected_default, expected_other - ): + @pytest.mark.parametrize("params", product(*[(True, False)] * 4)) + def test_convert_dtypes(self, data, maindtype, params, answerdict): if maindtype is not None: series = pd.Series(data, dtype=maindtype) else: series = pd.Series(data) + answers = {k: a for (kk, a) in answerdict.items() for k in product(*kk)} - result = series.convert_dtypes(*params) - - param_names = [ - "infer_objects", - "convert_string", - "convert_integer", - "convert_boolean", - "convert_floating", - ] - params_dict = dict(zip(param_names, params)) - - expected_dtype = expected_default - for spec, dtype in expected_other.items(): - if all(params_dict[key] is val for key, val in zip(spec[::2], spec[1::2])): - expected_dtype = dtype - - expected = pd.Series(data, dtype=expected_dtype) - tm.assert_series_equal(result, expected) + ns = series.convert_dtypes(*params) + expected_dtype = answers[tuple(params)] + expected = pd.Series(series.values, dtype=expected_dtype) + tm.assert_series_equal(ns, expected) # Test that it is a copy copy = series.copy(deep=True) - if is_interval_dtype(result.dtype) and result.dtype.subtype.kind in ["i", "u"]: + if is_interval_dtype(ns.dtype) and ns.dtype.subtype.kind in ["i", "u"]: msg = "Cannot set float NaN to integer-backed IntervalArray" with pytest.raises(ValueError, match=msg): - result[result.notna()] = np.nan + ns[ns.notna()] = np.nan else: - result[result.notna()] = np.nan + ns[ns.notna()] = np.nan # Make sure original not changed tm.assert_series_equal(series, copy) diff --git a/pandas/tests/series/methods/test_interpolate.py b/pandas/tests/series/methods/test_interpolate.py index 8740a309eec13..1b05f72f5cf4d 100644 --- a/pandas/tests/series/methods/test_interpolate.py +++ b/pandas/tests/series/methods/test_interpolate.py @@ -37,7 +37,7 @@ def nontemporal_method(request): separately from these non-temporal methods. 
""" method = request.param - kwargs = {"order": 1} if method in ("spline", "polynomial") else {} + kwargs = dict(order=1) if method in ("spline", "polynomial") else dict() return method, kwargs @@ -67,7 +67,7 @@ def interp_methods_ind(request): 'values' as a parameterization """ method = request.param - kwargs = {"order": 1} if method in ("spline", "polynomial") else {} + kwargs = dict(order=1) if method in ("spline", "polynomial") else dict() return method, kwargs @@ -458,82 +458,6 @@ def test_interp_limit_direction_raises(self, method, limit_direction, expected): with pytest.raises(ValueError, match=msg): s.interpolate(method=method, limit_direction=limit_direction) - @pytest.mark.parametrize( - "data, expected_data, kwargs", - ( - ( - [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], - [np.nan, np.nan, 3.0, 3.0, 3.0, 3.0, 7.0, np.nan, np.nan], - {"method": "pad", "limit_area": "inside"}, - ), - ( - [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], - [np.nan, np.nan, 3.0, 3.0, np.nan, np.nan, 7.0, np.nan, np.nan], - {"method": "pad", "limit_area": "inside", "limit": 1}, - ), - ( - [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], - [np.nan, np.nan, 3.0, np.nan, np.nan, np.nan, 7.0, 7.0, 7.0], - {"method": "pad", "limit_area": "outside"}, - ), - ( - [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], - [np.nan, np.nan, 3.0, np.nan, np.nan, np.nan, 7.0, 7.0, np.nan], - {"method": "pad", "limit_area": "outside", "limit": 1}, - ), - ( - [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], - [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], - {"method": "pad", "limit_area": "outside", "limit": 1}, - ), - ( - range(5), - range(5), - {"method": "pad", "limit_area": "outside", "limit": 1}, - ), - ), - ) - def test_interp_limit_area_with_pad(self, data, expected_data, kwargs): - # GH26796 - - s = Series(data) - expected = Series(expected_data) - result = s.interpolate(**kwargs) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize( - "data, expected_data, kwargs", - ( - ( - [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], - [np.nan, np.nan, 3.0, 7.0, 7.0, 7.0, 7.0, np.nan, np.nan], - {"method": "bfill", "limit_area": "inside"}, - ), - ( - [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], - [np.nan, np.nan, 3.0, np.nan, np.nan, 7.0, 7.0, np.nan, np.nan], - {"method": "bfill", "limit_area": "inside", "limit": 1}, - ), - ( - [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], - [3.0, 3.0, 3.0, np.nan, np.nan, np.nan, 7.0, np.nan, np.nan], - {"method": "bfill", "limit_area": "outside"}, - ), - ( - [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], - [np.nan, 3.0, 3.0, np.nan, np.nan, np.nan, 7.0, np.nan, np.nan], - {"method": "bfill", "limit_area": "outside", "limit": 1}, - ), - ), - ) - def test_interp_limit_area_with_backfill(self, data, expected_data, kwargs): - # GH26796 - - s = Series(data) - expected = Series(expected_data) - result = s.interpolate(**kwargs) - tm.assert_series_equal(result, expected) - def test_interp_limit_direction(self): # These tests are for issue #9218 -- fill NaNs in both directions. 
s = Series([1, 3, np.nan, np.nan, np.nan, 11]) diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index b204d92b9122f..c5196cea5d3bb 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -756,15 +756,12 @@ def test_align_date_objects_with_datetimeindex(self): ) @pytest.mark.parametrize("box", [list, tuple, np.array, pd.Index, pd.Series, pd.array]) @pytest.mark.parametrize("flex", [True, False]) -def test_series_ops_name_retention(flex, box, names, all_binary_operators, request): +def test_series_ops_name_retention(flex, box, names, all_binary_operators): # GH#33930 consistent name retention op = all_binary_operators - if op is ops.rfloordiv and box in [list, tuple] and not flex: - mark = pytest.mark.xfail( - reason="op fails because of inconsistent ndarray-wrapping GH#28759" - ) - request.node.add_marker(mark) + if op is ops.rfloordiv and box in [list, tuple]: + pytest.xfail("op fails because of inconsistent ndarray-wrapping GH#28759") left = Series(range(10), name=names[0]) right = Series(range(10), name=names[1]) @@ -841,8 +838,14 @@ class TestInplaceOperations: ( ("Int64", "Int64", "Int64", "Int64"), ("float", "float", "float", "float"), - ("Int64", "float", "Float64", "Float64"), - ("Int64", "Float64", "Float64", "Float64"), + ("Int64", "float", "float", "float"), + pytest.param( + "Int64", + "Float64", + "Float64", + "Float64", + marks=pytest.mark.xfail(reason="Not implemented yet"), + ), ), ) def test_series_inplace_ops(self, dtype1, dtype2, dtype_expected, dtype_mul): diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 35411d7e9cfb7..d836ca7a53249 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1044,6 +1044,7 @@ def test_different_nans_as_float64(self): expected = np.array([True, True]) tm.assert_numpy_array_equal(result, expected) + @pytest.mark.xfail(reason="problem related with issue #34125") def test_isin_int_df_string_search(self): """Comparing df with int`s (1,2) with a string at isin() ("1") -> should not match values because int 1 is not equal str 1""" @@ -1052,6 +1053,7 @@ def test_isin_int_df_string_search(self): expected_false = DataFrame({"values": [False, False]}) tm.assert_frame_equal(result, expected_false) + @pytest.mark.xfail(reason="problem related with issue #34125") def test_isin_nan_df_string_search(self): """Comparing df with nan value (np.nan,2) with a string at isin() ("NaN") -> should not match values because np.nan is not equal str NaN""" @@ -1060,6 +1062,7 @@ def test_isin_nan_df_string_search(self): expected_false = DataFrame({"values": [False, False]}) tm.assert_frame_equal(result, expected_false) + @pytest.mark.xfail(reason="problem related with issue #34125") def test_isin_float_df_string_search(self): """Comparing df with floats (1.4245,2.32441) with a string at isin() ("1.4245") -> should not match values because float 1.4245 is not equal str 1.4245""" diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py index f89958f7723ef..713607d087bc0 100644 --- a/pandas/tests/tools/test_to_numeric.py +++ b/pandas/tests/tools/test_to_numeric.py @@ -48,8 +48,8 @@ def transform_assert_equal(request): @pytest.mark.parametrize( "input_kwargs,result_kwargs", [ - ({}, {"dtype": np.int64}), - ({"errors": "coerce", "downcast": "integer"}, {"dtype": np.int8}), + (dict(), dict(dtype=np.int64)), + (dict(errors="coerce", downcast="integer"), dict(dtype=np.int8)), ], ) def test_empty(input_kwargs, 
result_kwargs): @@ -147,10 +147,10 @@ def test_list(): @pytest.mark.parametrize( "data,arr_kwargs", [ - ([1, 3, 4, 5], {"dtype": np.int64}), - ([1.0, 3.0, 4.0, 5.0], {}), + ([1, 3, 4, 5], dict(dtype=np.int64)), + ([1.0, 3.0, 4.0, 5.0], dict()), # Boolean is regarded as numeric. - ([True, False, True, True], {}), + ([True, False, True, True], dict()), ], ) def test_list_numeric(data, arr_kwargs): @@ -159,7 +159,7 @@ def test_list_numeric(data, arr_kwargs): tm.assert_numpy_array_equal(result, expected) -@pytest.mark.parametrize("kwargs", [{"dtype": "O"}, {}]) +@pytest.mark.parametrize("kwargs", [dict(dtype="O"), dict()]) def test_numeric(kwargs): data = [1, -3.14, 7] @@ -182,13 +182,13 @@ def test_numeric(kwargs): def test_numeric_df_columns(columns): # see gh-14827 df = DataFrame( - { - "a": [1.2, decimal.Decimal(3.14), decimal.Decimal("infinity"), "0.1"], - "b": [1.0, 2.0, 3.0, 4.0], - } + dict( + a=[1.2, decimal.Decimal(3.14), decimal.Decimal("infinity"), "0.1"], + b=[1.0, 2.0, 3.0, 4.0], + ) ) - expected = DataFrame({"a": [1.2, 3.14, np.inf, 0.1], "b": [1.0, 2.0, 3.0, 4.0]}) + expected = DataFrame(dict(a=[1.2, 3.14, np.inf, 0.1], b=[1.0, 2.0, 3.0, 4.0])) df_copy = df.copy() df_copy[columns] = df_copy[columns].apply(to_numeric) @@ -208,10 +208,10 @@ def test_numeric_df_columns(columns): ) def test_numeric_embedded_arr_likes(data, exp_data): # Test to_numeric with embedded lists and arrays - df = DataFrame({"a": data}) + df = DataFrame(dict(a=data)) df["a"] = df["a"].apply(to_numeric) - expected = DataFrame({"a": exp_data}) + expected = DataFrame(dict(a=exp_data)) tm.assert_frame_equal(df, expected) @@ -226,7 +226,7 @@ def test_all_nan(): def test_type_check(errors): # see gh-11776 df = DataFrame({"a": [1, -3.14, 7], "b": ["4", "5", "6"]}) - kwargs = {"errors": errors} if errors is not None else {} + kwargs = dict(errors=errors) if errors is not None else dict() error_ctx = pytest.raises(TypeError, match="1-d array") with error_ctx: @@ -241,7 +241,7 @@ def test_scalar(val, signed, transform): def test_really_large_scalar(large_val, signed, transform, errors): # see gh-24910 - kwargs = {"errors": errors} if errors is not None else {} + kwargs = dict(errors=errors) if errors is not None else dict() val = -large_val if signed else large_val val = transform(val) @@ -258,7 +258,7 @@ def test_really_large_scalar(large_val, signed, transform, errors): def test_really_large_in_arr(large_val, signed, transform, multiple_elts, errors): # see gh-24910 - kwargs = {"errors": errors} if errors is not None else {} + kwargs = dict(errors=errors) if errors is not None else dict() val = -large_val if signed else large_val val = transform(val) @@ -300,7 +300,7 @@ def test_really_large_in_arr_consistent(large_val, signed, multiple_elts, errors # # Even if we discover that we have to hold float, does not mean # we should be lenient on subsequent elements that fail to be integer. - kwargs = {"errors": errors} if errors is not None else {} + kwargs = dict(errors=errors) if errors is not None else dict() arr = [str(-large_val if signed else large_val)] if multiple_elts: @@ -452,12 +452,12 @@ def test_errors_invalid_value(): "kwargs,exp_dtype", [ # Basic function tests. - ({}, np.int64), - ({"downcast": None}, np.int64), + (dict(), np.int64), + (dict(downcast=None), np.int64), # Support below np.float32 is rare and far between. - ({"downcast": "float"}, np.dtype(np.float32).char), + (dict(downcast="float"), np.dtype(np.float32).char), # Basic dtype support. 
- ({"downcast": "unsigned"}, np.dtype(np.typecodes["UnsignedInteger"][0])), + (dict(downcast="unsigned"), np.dtype(np.typecodes["UnsignedInteger"][0])), ], ) def test_downcast_basic(data, kwargs, exp_dtype): diff --git a/pandas/tests/tseries/holiday/test_holiday.py b/pandas/tests/tseries/holiday/test_holiday.py index 0fb1da777e357..a2c146dbd65e8 100644 --- a/pandas/tests/tseries/holiday/test_holiday.py +++ b/pandas/tests/tseries/holiday/test_holiday.py @@ -210,16 +210,16 @@ def test_argument_types(transform): @pytest.mark.parametrize( "name,kwargs", [ - ("One-Time", {"year": 2012, "month": 5, "day": 28}), + ("One-Time", dict(year=2012, month=5, day=28)), ( "Range", - { - "month": 5, - "day": 28, - "start_date": datetime(2012, 1, 1), - "end_date": datetime(2012, 12, 31), - "offset": DateOffset(weekday=MO(1)), - }, + dict( + month=5, + day=28, + start_date=datetime(2012, 1, 1), + end_date=datetime(2012, 12, 31), + offset=DateOffset(weekday=MO(1)), + ), ), ], ) diff --git a/pandas/tests/tseries/holiday/test_observance.py b/pandas/tests/tseries/holiday/test_observance.py index 83038ad254b77..9ee63d2a36556 100644 --- a/pandas/tests/tseries/holiday/test_observance.py +++ b/pandas/tests/tseries/holiday/test_observance.py @@ -22,7 +22,6 @@ _SUNDAY = datetime(2014, 4, 13) _MONDAY = datetime(2014, 4, 14) _TUESDAY = datetime(2014, 4, 15) -_NEXT_WEDNESDAY = datetime(2014, 4, 16) @pytest.mark.parametrize("day", [_SATURDAY, _SUNDAY]) @@ -61,15 +60,7 @@ def test_weekend_to_monday(day, expected): @pytest.mark.parametrize( - "day,expected", - [ - (_WEDNESDAY, _THURSDAY), - (_THURSDAY, _FRIDAY), - (_SATURDAY, _MONDAY), - (_SUNDAY, _MONDAY), - (_MONDAY, _TUESDAY), - (_TUESDAY, _NEXT_WEDNESDAY), # WED is same week as TUE - ], + "day,expected", [(_SATURDAY, _MONDAY), (_SUNDAY, _MONDAY), (_MONDAY, _TUESDAY)] ) def test_next_workday(day, expected): assert next_workday(day) == expected @@ -83,16 +74,7 @@ def test_previous_workday(day, expected): @pytest.mark.parametrize( - "day,expected", - [ - (_THURSDAY, _WEDNESDAY), - (_FRIDAY, _THURSDAY), - (_SATURDAY, _THURSDAY), - (_SUNDAY, _FRIDAY), - (_MONDAY, _FRIDAY), # last week Friday - (_TUESDAY, _MONDAY), - (_NEXT_WEDNESDAY, _TUESDAY), # WED is same week as TUE - ], + "day,expected", [(_SATURDAY, _THURSDAY), (_SUNDAY, _FRIDAY), (_TUESDAY, _MONDAY)] ) def test_before_nearest_workday(day, expected): assert before_nearest_workday(day) == expected diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index 1ac98247780b7..fca1316493e85 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -4191,8 +4191,8 @@ class TestDST: # test both basic names and dateutil timezones timezone_utc_offsets = { - "US/Eastern": {"utc_offset_daylight": -4, "utc_offset_standard": -5}, - "dateutil/US/Pacific": {"utc_offset_daylight": -7, "utc_offset_standard": -8}, + "US/Eastern": dict(utc_offset_daylight=-4, utc_offset_standard=-5), + "dateutil/US/Pacific": dict(utc_offset_daylight=-7, utc_offset_standard=-8), } valid_date_offsets_singular = [ "weekday", diff --git a/pandas/tests/tslibs/test_to_offset.py b/pandas/tests/tslibs/test_to_offset.py index 5b1134ee85e2c..93e5e2c801c09 100644 --- a/pandas/tests/tslibs/test_to_offset.py +++ b/pandas/tests/tslibs/test_to_offset.py @@ -131,15 +131,15 @@ def test_to_offset_leading_plus(freqstr, expected): @pytest.mark.parametrize( "kwargs,expected", [ - ({"days": 1, "seconds": 1}, offsets.Second(86401)), - ({"days": -1, "seconds": 1}, 
offsets.Second(-86399)), - ({"hours": 1, "minutes": 10}, offsets.Minute(70)), - ({"hours": 1, "minutes": -10}, offsets.Minute(50)), - ({"weeks": 1}, offsets.Day(7)), - ({"hours": 1}, offsets.Hour(1)), - ({"hours": 1}, to_offset("60min")), - ({"microseconds": 1}, offsets.Micro(1)), - ({"microseconds": 0}, offsets.Nano(0)), + (dict(days=1, seconds=1), offsets.Second(86401)), + (dict(days=-1, seconds=1), offsets.Second(-86399)), + (dict(hours=1, minutes=10), offsets.Minute(70)), + (dict(hours=1, minutes=-10), offsets.Minute(50)), + (dict(weeks=1), offsets.Day(7)), + (dict(hours=1), offsets.Hour(1)), + (dict(hours=1), to_offset("60min")), + (dict(microseconds=1), offsets.Micro(1)), + (dict(microseconds=0), offsets.Nano(0)), ], ) def test_to_offset_pd_timedelta(kwargs, expected): diff --git a/pandas/tests/util/test_assert_categorical_equal.py b/pandas/tests/util/test_assert_categorical_equal.py index 29a0805bceb98..8957e7a172666 100644 --- a/pandas/tests/util/test_assert_categorical_equal.py +++ b/pandas/tests/util/test_assert_categorical_equal.py @@ -16,7 +16,7 @@ def test_categorical_equal(c): def test_categorical_equal_order_mismatch(check_category_order): c1 = Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4]) c2 = Categorical([1, 2, 3, 4], categories=[4, 3, 2, 1]) - kwargs = {"check_category_order": check_category_order} + kwargs = dict(check_category_order=check_category_order) if check_category_order: msg = """Categorical\\.categories are different diff --git a/pandas/tests/util/test_assert_extension_array_equal.py b/pandas/tests/util/test_assert_extension_array_equal.py index 545f0dcbf695f..f9259beab5d13 100644 --- a/pandas/tests/util/test_assert_extension_array_equal.py +++ b/pandas/tests/util/test_assert_extension_array_equal.py @@ -9,9 +9,9 @@ @pytest.mark.parametrize( "kwargs", [ - {}, # Default is check_exact=False - {"check_exact": False}, - {"check_exact": True}, + dict(), # Default is check_exact=False + dict(check_exact=False), + dict(check_exact=True), ], ) def test_assert_extension_array_equal_not_exact(kwargs): @@ -55,7 +55,7 @@ def test_assert_extension_array_equal_less_precise(decimals): def test_assert_extension_array_equal_dtype_mismatch(check_dtype): end = 5 - kwargs = {"check_dtype": check_dtype} + kwargs = dict(check_dtype=check_dtype) arr1 = SparseArray(np.arange(end, dtype="int64")) arr2 = SparseArray(np.arange(end, dtype="int32")) diff --git a/pandas/tests/util/test_assert_frame_equal.py b/pandas/tests/util/test_assert_frame_equal.py index 8034ace479a62..d5161ce37494b 100644 --- a/pandas/tests/util/test_assert_frame_equal.py +++ b/pandas/tests/util/test_assert_frame_equal.py @@ -120,7 +120,7 @@ def test_frame_equal_shape_mismatch(df1, df2, obj_fixture): ], ) def test_frame_equal_index_dtype_mismatch(df1, df2, msg, check_index_type): - kwargs = {"check_index_type": check_index_type} + kwargs = dict(check_index_type=check_index_type) if check_index_type: with pytest.raises(AssertionError, match=msg): @@ -134,7 +134,7 @@ def test_empty_dtypes(check_dtype): df1 = DataFrame(columns=columns) df2 = DataFrame(columns=columns) - kwargs = {"check_dtype": check_dtype} + kwargs = dict(check_dtype=check_dtype) df1["col1"] = df1["col1"].astype("int64") if check_dtype: @@ -272,20 +272,6 @@ def test_assert_frame_equal_ignore_extension_dtype_mismatch(right_dtype): tm.assert_frame_equal(left, right, check_dtype=False) -@pytest.mark.parametrize( - "dtype", - [ - ("timedelta64[ns]"), - ("datetime64[ns, UTC]"), - ("Period[D]"), - ], -) -def 
test_assert_frame_equal_datetime_like_dtype_mismatch(dtype): - df1 = DataFrame({"a": []}, dtype=dtype) - df2 = DataFrame({"a": []}) - tm.assert_frame_equal(df1, df2, check_dtype=False) - - def test_allows_duplicate_labels(): left = DataFrame() right = DataFrame().set_flags(allows_duplicate_labels=False) diff --git a/pandas/tests/util/test_assert_index_equal.py b/pandas/tests/util/test_assert_index_equal.py index 988a0e7b24379..21d5a456e20d0 100644 --- a/pandas/tests/util/test_assert_index_equal.py +++ b/pandas/tests/util/test_assert_index_equal.py @@ -85,7 +85,7 @@ def test_index_equal_values_close(check_exact): def test_index_equal_values_less_close(check_exact, rtol): idx1 = Index([1, 2, 3.0]) idx2 = Index([1, 2, 3.0001]) - kwargs = {"check_exact": check_exact, "rtol": rtol} + kwargs = dict(check_exact=check_exact, rtol=rtol) if check_exact or rtol < 0.5e-3: msg = """Index are different @@ -103,7 +103,7 @@ def test_index_equal_values_less_close(check_exact, rtol): def test_index_equal_values_too_far(check_exact, rtol): idx1 = Index([1, 2, 3]) idx2 = Index([1, 2, 4]) - kwargs = {"check_exact": check_exact, "rtol": rtol} + kwargs = dict(check_exact=check_exact, rtol=rtol) msg = """Index are different @@ -140,7 +140,7 @@ def test_index_equal_value_oder_mismatch(check_exact, rtol, check_order): def test_index_equal_level_values_mismatch(check_exact, rtol): idx1 = MultiIndex.from_tuples([("A", 2), ("A", 2), ("B", 3), ("B", 4)]) idx2 = MultiIndex.from_tuples([("A", 1), ("A", 2), ("B", 3), ("B", 4)]) - kwargs = {"check_exact": check_exact, "rtol": rtol} + kwargs = dict(check_exact=check_exact, rtol=rtol) msg = """MultiIndex level \\[1\\] are different diff --git a/pandas/tests/util/test_assert_interval_array_equal.py b/pandas/tests/util/test_assert_interval_array_equal.py index 8cc4ade3d7e95..2e8699536c72a 100644 --- a/pandas/tests/util/test_assert_interval_array_equal.py +++ b/pandas/tests/util/test_assert_interval_array_equal.py @@ -7,9 +7,9 @@ @pytest.mark.parametrize( "kwargs", [ - {"start": 0, "periods": 4}, - {"start": 1, "periods": 5}, - {"start": 5, "end": 10, "closed": "left"}, + dict(start=0, periods=4), + dict(start=1, periods=5), + dict(start=5, end=10, closed="left"), ], ) def test_interval_array_equal(kwargs): @@ -18,7 +18,7 @@ def test_interval_array_equal(kwargs): def test_interval_array_equal_closed_mismatch(): - kwargs = {"start": 0, "periods": 5} + kwargs = dict(start=0, periods=5) arr1 = interval_range(closed="left", **kwargs).values arr2 = interval_range(closed="right", **kwargs).values @@ -34,7 +34,7 @@ def test_interval_array_equal_closed_mismatch(): def test_interval_array_equal_periods_mismatch(): - kwargs = {"start": 0} + kwargs = dict(start=0) arr1 = interval_range(periods=5, **kwargs).values arr2 = interval_range(periods=6, **kwargs).values @@ -50,7 +50,7 @@ def test_interval_array_equal_periods_mismatch(): def test_interval_array_equal_end_mismatch(): - kwargs = {"start": 0, "periods": 5} + kwargs = dict(start=0, periods=5) arr1 = interval_range(end=10, **kwargs).values arr2 = interval_range(end=20, **kwargs).values @@ -66,7 +66,7 @@ def test_interval_array_equal_end_mismatch(): def test_interval_array_equal_start_mismatch(): - kwargs = {"periods": 4} + kwargs = dict(periods=4) arr1 = interval_range(start=0, **kwargs).values arr2 = interval_range(start=1, **kwargs).values diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py index ae4523014b01d..0f56fb0b93642 100644 --- 
a/pandas/tests/util/test_assert_series_equal.py +++ b/pandas/tests/util/test_assert_series_equal.py @@ -87,9 +87,9 @@ def test_series_not_equal_value_mismatch(data1, data2): @pytest.mark.parametrize( "kwargs", [ - {"dtype": "float64"}, # dtype mismatch - {"index": [1, 2, 4]}, # index mismatch - {"name": "foo"}, # name mismatch + dict(dtype="float64"), # dtype mismatch + dict(index=[1, 2, 4]), # index mismatch + dict(name="foo"), # name mismatch ], ) def test_series_not_equal_metadata_mismatch(kwargs): @@ -140,7 +140,7 @@ def test_less_precise(data1, data2, dtype, decimals): ], ) def test_series_equal_index_dtype(s1, s2, msg, check_index_type): - kwargs = {"check_index_type": check_index_type} + kwargs = dict(check_index_type=check_index_type) if check_index_type: with pytest.raises(AssertionError, match=msg): diff --git a/pandas/tests/util/test_show_versions.py b/pandas/tests/util/test_show_versions.py index 4ea3ebe5000ad..fe5fc3e21d960 100644 --- a/pandas/tests/util/test_show_versions.py +++ b/pandas/tests/util/test_show_versions.py @@ -39,7 +39,7 @@ def test_show_versions(capsys): assert re.search(r"commit\s*:\s[0-9a-f]{40}\n", result) # check required dependency - assert re.search(r"numpy\s*:\s([0-9\.\+a-f\_]|dev)+\n", result) + assert re.search(r"numpy\s*:\s([0-9\.\+a-f]|dev)+\n", result) # check optional dependency assert re.search(r"pyarrow\s*:\s([0-9\.]+|None)\n", result) diff --git a/pandas/tests/util/test_validate_args.py b/pandas/tests/util/test_validate_args.py index db532480efe07..746d859b3322e 100644 --- a/pandas/tests/util/test_validate_args.py +++ b/pandas/tests/util/test_validate_args.py @@ -30,7 +30,7 @@ def test_bad_arg_length_max_value_single(): def test_bad_arg_length_max_value_multiple(): args = (None, None) - compat_args = {"foo": None} + compat_args = dict(foo=None) min_fname_arg_count = 2 max_length = len(compat_args) + min_fname_arg_count @@ -61,7 +61,7 @@ def test_not_all_defaults(i): def test_validation(): # No exceptions should be raised. - validate_args(_fname, (None,), 2, {"out": None}) + validate_args(_fname, (None,), 2, dict(out=None)) compat_args = {"axis": 1, "out": None} validate_args(_fname, (1, None), 2, compat_args) diff --git a/pandas/tests/util/test_validate_kwargs.py b/pandas/tests/util/test_validate_kwargs.py index c357affb6203d..8fe2a3712bf49 100644 --- a/pandas/tests/util/test_validate_kwargs.py +++ b/pandas/tests/util/test_validate_kwargs.py @@ -41,7 +41,7 @@ def test_validation(): # No exceptions should be raised. 
compat_args = {"f": None, "b": 1, "ba": "s"} - kwargs = {"f": None, "b": 1} + kwargs = dict(f=None, b=1) validate_kwargs(_fname, kwargs, compat_args) diff --git a/pandas/tests/window/moments/test_moments_consistency_ewm.py b/pandas/tests/window/moments/test_moments_consistency_ewm.py index 57665b47dea7f..aa3453680190b 100644 --- a/pandas/tests/window/moments/test_moments_consistency_ewm.py +++ b/pandas/tests/window/moments/test_moments_consistency_ewm.py @@ -11,6 +11,7 @@ def test_ewm_pairwise_cov_corr(func, frame): result = result.loc[(slice(None), 1), 5] result.index = result.index.droplevel(1) expected = getattr(frame[1].ewm(span=10, min_periods=5), func)(frame[5]) + expected.index = expected.index._with_freq(None) tm.assert_series_equal(result, expected, check_names=False) diff --git a/pandas/tests/window/moments/test_moments_consistency_rolling.py b/pandas/tests/window/moments/test_moments_consistency_rolling.py index 53e5354340dcc..802ece77fd36d 100644 --- a/pandas/tests/window/moments/test_moments_consistency_rolling.py +++ b/pandas/tests/window/moments/test_moments_consistency_rolling.py @@ -51,6 +51,7 @@ def test_rolling_pairwise_cov_corr(func, frame): result = result.loc[(slice(None), 1), 5] result.index = result.index.droplevel(1) expected = getattr(frame[1].rolling(window=10, min_periods=5), func)(frame[5]) + expected.index = expected.index._with_freq(None) tm.assert_series_equal(result, expected, check_names=False) diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index b89fb35ac3a70..f9b5a5fe9a3c1 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -1,15 +1,7 @@ import numpy as np import pytest -from pandas import ( - DataFrame, - Index, - MultiIndex, - Series, - Timestamp, - date_range, - to_datetime, -) +from pandas import DataFrame, MultiIndex, Series, Timestamp, date_range, to_datetime import pandas._testing as tm from pandas.api.indexers import BaseIndexer from pandas.core.groupby.groupby import get_groupby @@ -426,23 +418,12 @@ def test_groupby_rolling_empty_frame(self): # GH 36197 expected = DataFrame({"s1": []}) result = expected.groupby("s1").rolling(window=1).sum() - # GH-38057 from_tuples gives empty object dtype, we now get float/int levels - # expected.index = MultiIndex.from_tuples([], names=["s1", None]) - expected.index = MultiIndex.from_product( - [Index([], dtype="float64"), Index([], dtype="int64")], names=["s1", None] - ) + expected.index = MultiIndex.from_tuples([], names=["s1", None]) tm.assert_frame_equal(result, expected) expected = DataFrame({"s1": [], "s2": []}) result = expected.groupby(["s1", "s2"]).rolling(window=1).sum() - expected.index = MultiIndex.from_product( - [ - Index([], dtype="float64"), - Index([], dtype="float64"), - Index([], dtype="int64"), - ], - names=["s1", "s2", None], - ) + expected.index = MultiIndex.from_tuples([], names=["s1", "s2", None]) tm.assert_frame_equal(result, expected) def test_groupby_rolling_string_index(self): @@ -586,60 +567,6 @@ def test_groupby_rolling_index_level_and_column_label(self): ) tm.assert_frame_equal(result, expected) - def test_groupby_rolling_resulting_multiindex(self): - # a few different cases checking the created MultiIndex of the result - # https://github.com/pandas-dev/pandas/pull/38057 - - # grouping by 1 columns -> 2-level MI as result - df = DataFrame({"a": np.arange(8.0), "b": [1, 2] * 4}) - result = df.groupby("b").rolling(3).mean() - expected_index = MultiIndex.from_tuples( - [(1, 0), (1, 2), (1, 
4), (1, 6), (2, 1), (2, 3), (2, 5), (2, 7)], - names=["b", None], - ) - tm.assert_index_equal(result.index, expected_index) - - # grouping by 2 columns -> 3-level MI as result - df = DataFrame({"a": np.arange(12.0), "b": [1, 2] * 6, "c": [1, 2, 3, 4] * 3}) - result = df.groupby(["b", "c"]).rolling(2).sum() - expected_index = MultiIndex.from_tuples( - [ - (1, 1, 0), - (1, 1, 4), - (1, 1, 8), - (1, 3, 2), - (1, 3, 6), - (1, 3, 10), - (2, 2, 1), - (2, 2, 5), - (2, 2, 9), - (2, 4, 3), - (2, 4, 7), - (2, 4, 11), - ], - names=["b", "c", None], - ) - tm.assert_index_equal(result.index, expected_index) - - # grouping with 1 level on dataframe with 2-level MI -> 3-level MI as result - df = DataFrame({"a": np.arange(8.0), "b": [1, 2] * 4, "c": [1, 2, 3, 4] * 2}) - df = df.set_index("c", append=True) - result = df.groupby("b").rolling(3).mean() - expected_index = MultiIndex.from_tuples( - [ - (1, 0, 1), - (1, 2, 3), - (1, 4, 1), - (1, 6, 3), - (2, 1, 2), - (2, 3, 4), - (2, 5, 2), - (2, 7, 4), - ], - names=["b", None, "c"], - ) - tm.assert_index_equal(result.index, expected_index) - class TestExpanding: def setup_method(self): diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 10b23cadfe279..1658cca347786 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1085,15 +1085,8 @@ def test_groupby_rolling_nan_included(): result = df.groupby("group", dropna=False).rolling(1, min_periods=1).mean() expected = DataFrame( {"B": [0.0, 2.0, 3.0, 1.0, 4.0]}, - # GH-38057 from_tuples puts the NaNs in the codes, result expects them - # to be in the levels, at the moment - # index=MultiIndex.from_tuples( - # [("g1", 0), ("g1", 2), ("g2", 3), (np.nan, 1), (np.nan, 4)], - # names=["group", None], - # ), - index=MultiIndex( - [["g1", "g2", np.nan], [0, 1, 2, 3, 4]], - [[0, 0, 1, 2, 2], [0, 2, 3, 1, 4]], + index=MultiIndex.from_tuples( + [("g1", 0), ("g1", 2), ("g2", 3), (np.nan, 1), (np.nan, 4)], names=["group", None], ), ) diff --git a/pandas/util/_print_versions.py b/pandas/util/_print_versions.py index 5256cc29d5543..72003eeddf5ee 100644 --- a/pandas/util/_print_versions.py +++ b/pandas/util/_print_versions.py @@ -106,7 +106,7 @@ def show_versions(as_json: Union[str, bool] = False) -> None: deps = _get_dependency_info() if as_json: - j = {"system": sys_info, "dependencies": deps} + j = dict(system=sys_info, dependencies=deps) if as_json is True: print(j) From a0262ab0b4f5be9b452412c9862184fabed9ad9d Mon Sep 17 00:00:00 2001 From: GYHHAHA <1801214626@qq.com> Date: Fri, 11 Dec 2020 12:53:05 +0800 Subject: [PATCH 26/42] Revert "fix doc" This reverts commit b49229367fc3ab02e81c8c373d05c021560054f2. --- doc/source/user_guide/timeseries.rst | 28 ++++++++++++-------------- doc/source/whatsnew/v1.2.0.rst | 30 +++++++++++++++++++--------- 2 files changed, 34 insertions(+), 24 deletions(-) diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 843da644848b1..bee72ec70d95e 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -1888,39 +1888,31 @@ Those two examples are equivalent for this time series: Note the use of ``'start'`` for ``origin`` on the last example. In that case, ``origin`` will be set to the first value of the timeseries. -.. _timeseries.backward-resample: - Backward resample ~~~~~~~~~~~~~~~~~ .. 
versionadded:: 1.2.0 -``origin`` can not only make a forward resample, namely grouping from the starting point with the given ``freq``, but is also able to implement the backward resample. This method allows users to control bins of the grouping from the given origin with a backward direction. (:issue:`37804`) +``origin`` can not only make a forward resample, namely grouping from the starting point with the given ``freq`` , but is also able to implement the backward resample. This method allows users to control bins of the grouping from the given origin with a backward direction. (:issue:`37804`) .. ipython:: python start, end = "2000-10-01 23:30:00", "2000-10-02 00:26:00" - rng = pd.date_range(start, end, freq="7min") - ts = pd.Series(np.arange(len(rng)) * 3, index=rng) + rng = date_range(start, end, freq="7min") + ts = Series(np.arange(len(rng)) * 3, index=rng) -Setting ``origin='end'`` means using the max ``Timestamp`` as the ``origin`` with ``backward=True``. +Setting ``origin='end'`` means using the max ``Timestamp`` as the ``origin`` with ``backward=True`` . ts.index.max() ts.resample("17min", origin="end").sum() -The forward resample output stands for the grouping result from current datetimeindex to the next one with ``closed=left`` by default. In contrast, the backward resample output stands for the grouping result from former datetimeindex to the current one with ``closed=right`` by default. If you want to change this, ``closed=left`` is available. - -.. ipython:: python - - ts.resample("17min", closed="left", origin="end").sum() - -Setting ``origin='end_day'`` means using the ceiling midnight of the max ``Timestamp`` as the ``origin`` with ``backward=True``. +Setting ``origin='end_day'`` means using the ceiling midnight of the max ``Timestamp`` as the ``origin`` with ``backward=True`` . .. ipython:: python - ts.resample("17min", origin="end_day").sum() + ts.resample("17min", origin="end_day").sum() -If you want to make the backward resample from a Timestamp-like ``origin``, ``backward=True`` should be set. +If you want to make the backward resample from a Timestamp-like ``origin`` , ``backward=True`` should be set. .. ipython:: python @@ -1934,6 +1926,12 @@ You can implement ``offset='end_day'`` in the following method equivalently. end_day_origin ts.resample("17min", origin=end_day_origin, backward=True).sum() +By default, backward resampling uses ``closed=right`` while ``closed=left`` is also available. + +.. ipython:: python + + ts.resample("17min", closed="left", origin="end").sum() + .. _timeseries.periods: Time span representation diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index d45813960d5c2..ac8132339d38c 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -206,35 +206,47 @@ level-by-level basis. .. _whatsnew_120.backward_resample: -Backward resample +Backward resample ^^^^^^^^^^^^^^^^^ -:class:`Grouper` and :meth:`DataFrame.resample` now support the argument ``backward``. ``'end'`` and ``'end_day'`` are available in argument ``origin``. Backward resample allows users to control bins of the grouping from the given origin with a backward direction. (:issue:`37804`) +:class:`Grouper` and :meth:`DataFrame.resample` now support the argument ``backward`` . ``'end'`` and ``'end_day'`` are available in argument ``origin`` . Backward resample allows users to control bins of the grouping from the given origin with a backward direction. (:issue:`37804`) ..
ipython:: python start, end = "2000-10-01 23:30:00", "2000-10-02 00:26:00" - rng = pd.date_range(start, end, freq="7min") - ts = pd.Series(np.arange(len(rng)) * 3, index=rng) + rng = date_range(start, end, freq="7min") + ts = Series(np.arange(len(rng)) * 3, index=rng) -Setting ``origin='end'`` means using the max ``Timestamp`` as the ``origin`` with ``backward=True``. +Setting ``origin='end'`` means using the max ``Timestamp`` as the ``origin`` with ``backward=True`` . ts.index.max() ts.resample("17min", origin="end").sum() -Setting ``origin='end_day'`` means using the ceiling midnight of the max ``Timestamp`` as the ``origin`` with ``backward=True``. +Setting ``origin='end_day'`` means using the ceiling midnight of the max ``Timestamp`` as the ``origin`` with ``backward=True`` . .. ipython:: python - ts.resample("17min", origin="end_day").sum() + ts.resample("17min", origin="end_day").sum() -If you want to make the backward resample from a Timestamp-like ``origin``, ``backward=True`` should be set. +If you want to make the backward resample from a Timestamp-like ``origin`` , ``backward=True`` should be set. .. ipython:: python ts.resample("17min", origin="2000-10-02 00:40:00", backward=True).sum() -For details, see: :ref:`timeseries.backward-resample`. +You can implement ``origin='end_day'`` in the following method equivalently. + +.. ipython:: python + + end_day_origin = ts.index.max().ceil("D") + end_day_origin + ts.resample("17min", origin=end_day_origin, backward=True).sum() + +By default, backward resampling uses ``closed=right`` while ``closed=left`` is also available. + +.. ipython:: python + + ts.resample("17min", closed="left", origin="end").sum() .. _whatsnew_120.groupby_ewm: From b990c5f5043f0a063f457b8a9f03794c6ffa919d Mon Sep 17 00:00:00 2001 From: GYHHAHA <1801214626@qq.com> Date: Fri, 11 Dec 2020 12:53:25 +0800 Subject: [PATCH 27/42] Revert "Merge branch 'master' into master" This reverts commit 0cff41ecb37d5663e7eb6debf98e4b6eea9a4a54, reversing changes made to 77fc4a3fc0acaf1783f7ed87b53da6688b0f6395.
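The end-anchored, right-closed binning documented in the two doc hunks above can be sketched without the new ``origin='end'`` code path itself. The snippet below is only an illustration under stated assumptions: it uses long-standing pandas API (``date_range``, ``Timedelta``, ``cut``), the helper names ``n_bins`` and ``edges`` are invented for the example, and only the bin edges — not the output labels — are meant to correspond to the ``resample("17min", origin="end")`` examples shown earlier.

.. code-block:: python

    import numpy as np
    import pandas as pd

    # Toy series matching the documentation examples above.
    start, end = "2000-10-01 23:30:00", "2000-10-02 00:26:00"
    rng = pd.date_range(start, end, freq="7min")
    ts = pd.Series(np.arange(len(rng)) * 3, index=rng)

    freq = pd.Timedelta("17min")

    # Anchor on the last timestamp and step backwards in whole multiples
    # of the frequency until the earliest observation is covered.
    origin = ts.index.max()
    n_bins = int(np.ceil((origin - ts.index.min()) / freq))
    edges = pd.DatetimeIndex([origin - i * freq for i in range(n_bins, -1, -1)])

    # Right-closed bins, mirroring the documented closed='right' default
    # for backward resampling.
    print(ts.groupby(pd.cut(ts.index, edges, right=True)).sum())

Stepping back from ``ts.index.max()`` in whole multiples of the frequency is what makes the last bin end exactly on the final observation, which is the point of anchoring the origin at the end.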
--- .github/workflows/ci.yml | 4 +- .gitignore | 1 - .pre-commit-config.yaml | 2 +- .travis.yml | 7 +- Dockerfile | 2 +- Makefile | 2 +- README.md | 2 +- asv_bench/benchmarks/algorithms.py | 12 - asv_bench/benchmarks/categoricals.py | 43 - asv_bench/benchmarks/groupby.py | 2 +- asv_bench/benchmarks/hash_functions.py | 164 --- asv_bench/benchmarks/join_merge.py | 6 - asv_bench/benchmarks/reshape.py | 5 +- asv_bench/benchmarks/rolling.py | 13 - asv_bench/benchmarks/series_methods.py | 73 +- azure-pipelines.yml | 2 +- ci/azure/posix.yml | 5 - ci/azure/windows.yml | 2 +- ci/build39.sh | 12 + ci/check_cache.sh | 27 + ci/code_checks.sh | 2 +- ci/deps/azure-38-locale.yaml | 2 +- ci/deps/azure-39.yaml | 22 - ci/deps/travis-37-locale.yaml | 2 +- ci/run_tests.sh | 2 +- ci/setup_env.sh | 13 +- doc/source/development/contributing.rst | 53 +- .../development/contributing_docstring.rst | 10 +- doc/source/development/extending.rst | 2 +- doc/source/development/index.rst | 1 - doc/source/development/policies.rst | 2 +- doc/source/development/test_writing.rst | 174 --- doc/source/ecosystem.rst | 14 +- doc/source/getting_started/install.rst | 2 +- .../intro_tutorials/04_plotting.rst | 4 +- doc/source/reference/index.rst | 1 + doc/source/reference/panel.rst | 10 + doc/source/reference/style.rst | 1 - doc/source/reference/window.rst | 28 +- doc/source/user_guide/10min.rst | 4 +- doc/source/user_guide/basics.rst | 10 +- doc/source/user_guide/computation.rst | 989 +++++++++++++++++- doc/source/user_guide/cookbook.rst | 7 +- doc/source/user_guide/dsintro.rst | 2 +- doc/source/user_guide/enhancingperf.rst | 8 +- doc/source/user_guide/groupby.rst | 15 +- doc/source/user_guide/index.rst | 1 - doc/source/user_guide/indexing.rst | 78 +- doc/source/user_guide/integer_na.rst | 2 +- doc/source/user_guide/io.rst | 9 +- doc/source/user_guide/merging.rst | 9 +- doc/source/user_guide/options.rst | 8 +- doc/source/user_guide/sparse.rst | 2 +- doc/source/user_guide/style.ipynb | 34 +- doc/source/user_guide/timeseries.rst | 28 +- doc/source/user_guide/window.rst | 593 ----------- doc/source/whatsnew/v0.12.0.rst | 6 +- doc/source/whatsnew/v0.14.0.rst | 4 +- doc/source/whatsnew/v0.15.0.rst | 6 +- doc/source/whatsnew/v0.15.2.rst | 2 +- doc/source/whatsnew/v0.16.1.rst | 4 +- doc/source/whatsnew/v0.16.2.rst | 2 +- doc/source/whatsnew/v0.18.0.rst | 4 +- doc/source/whatsnew/v0.19.0.rst | 2 +- doc/source/whatsnew/v0.20.0.rst | 12 +- doc/source/whatsnew/v0.21.0.rst | 2 +- doc/source/whatsnew/v0.24.0.rst | 4 +- doc/source/whatsnew/v0.6.0.rst | 2 +- doc/source/whatsnew/v0.6.1.rst | 4 +- doc/source/whatsnew/v0.8.0.rst | 2 +- doc/source/whatsnew/v1.0.0.rst | 4 +- doc/source/whatsnew/v1.1.5.rst | 24 +- doc/source/whatsnew/v1.2.0.rst | 427 +++----- environment.yml | 3 - pandas/__init__.py | 19 +- pandas/_libs/groupby.pyx | 30 +- pandas/_libs/hashtable.pxd | 56 - pandas/_libs/hashtable.pyx | 44 +- pandas/_libs/hashtable_class_helper.pxi.in | 98 +- pandas/_libs/hashtable_func_helper.pxi.in | 18 +- pandas/_libs/index_class_helper.pxi.in | 30 +- pandas/_libs/interval.pyx | 3 +- pandas/_libs/khash.pxd | 83 +- .../_libs/khash_for_primitive_helper.pxi.in | 42 - pandas/_libs/lib.pyx | 14 +- pandas/_libs/reduction.pyx | 4 +- pandas/_libs/src/klib/khash.h | 181 +--- pandas/_libs/src/klib/khash_python.h | 124 +-- pandas/_libs/src/parser/tokenizer.c | 81 +- pandas/_libs/tslibs/offsets.pyx | 26 - pandas/_libs/tslibs/timedeltas.pyx | 9 +- pandas/_libs/tslibs/tzconversion.pyx | 6 +- pandas/_libs/window/aggregations.pyx | 31 +- pandas/_testing.py | 39 +- 
pandas/_typing.py | 5 + pandas/_version.py | 301 ++---- pandas/compat/_optional.py | 2 +- pandas/conftest.py | 48 +- pandas/core/algorithms.py | 107 +- pandas/core/apply.py | 33 +- pandas/core/arraylike.py | 144 +-- pandas/core/arrays/_mixins.py | 92 +- pandas/core/arrays/base.py | 47 +- pandas/core/arrays/categorical.py | 75 +- pandas/core/arrays/datetimelike.py | 98 +- pandas/core/arrays/datetimes.py | 12 +- pandas/core/arrays/floating.py | 2 +- pandas/core/arrays/integer.py | 2 +- pandas/core/arrays/interval.py | 305 +++--- pandas/core/arrays/masked.py | 14 +- pandas/core/arrays/numpy_.py | 2 +- pandas/core/arrays/period.py | 1 - pandas/core/arrays/sparse/array.py | 16 +- pandas/core/arrays/string_.py | 4 + pandas/core/arrays/string_arrow.py | 625 ----------- pandas/core/arrays/timedeltas.py | 4 +- pandas/core/base.py | 21 +- pandas/core/common.py | 45 +- pandas/core/computation/align.py | 14 +- pandas/core/computation/parsing.py | 8 +- pandas/core/computation/pytables.py | 4 - pandas/core/construction.py | 6 +- pandas/core/dtypes/base.py | 5 +- pandas/core/dtypes/cast.py | 10 +- pandas/core/dtypes/common.py | 2 +- pandas/core/dtypes/concat.py | 6 +- pandas/core/dtypes/dtypes.py | 2 +- pandas/core/dtypes/generic.py | 22 +- pandas/core/frame.py | 204 ++-- pandas/core/generic.py | 209 ++-- pandas/core/groupby/base.py | 1 - pandas/core/groupby/generic.py | 18 +- pandas/core/groupby/groupby.py | 44 +- pandas/core/groupby/ops.py | 5 +- pandas/core/indexers.py | 2 +- pandas/core/indexes/base.py | 142 +-- pandas/core/indexes/category.py | 123 ++- pandas/core/indexes/datetimelike.py | 230 ++-- pandas/core/indexes/datetimes.py | 52 +- pandas/core/indexes/extension.py | 64 +- pandas/core/indexes/interval.py | 164 +-- pandas/core/indexes/multi.py | 272 ++--- pandas/core/indexes/numeric.py | 97 +- pandas/core/indexes/period.py | 82 +- pandas/core/indexes/range.py | 24 +- pandas/core/indexes/timedeltas.py | 31 +- pandas/core/indexing.py | 196 ++-- pandas/core/internals/blocks.py | 313 +++--- pandas/core/internals/concat.py | 6 +- pandas/core/internals/construction.py | 5 +- pandas/core/internals/managers.py | 59 +- pandas/core/nanops.py | 2 +- pandas/core/ops/array_ops.py | 8 +- pandas/core/resample.py | 4 +- pandas/core/reshape/concat.py | 28 +- pandas/core/reshape/merge.py | 81 +- pandas/core/reshape/pivot.py | 3 +- pandas/core/reshape/reshape.py | 12 +- pandas/core/series.py | 98 +- pandas/core/shared_docs.py | 63 -- pandas/core/sorting.py | 23 +- pandas/core/strings/accessor.py | 3 +- pandas/core/tools/numeric.py | 6 +- pandas/core/tools/timedeltas.py | 5 - pandas/core/window/__init__.py | 5 +- pandas/core/window/common.py | 4 - pandas/core/window/ewm.py | 158 +-- pandas/core/window/indexers.py | 15 - pandas/core/window/numba_.py | 89 -- pandas/core/window/rolling.py | 48 +- pandas/io/common.py | 201 ++-- pandas/io/excel/_base.py | 150 ++- pandas/io/excel/_odfreader.py | 4 +- pandas/io/excel/_odswriter.py | 16 +- pandas/io/excel/_openpyxl.py | 24 +- pandas/io/excel/_pyxlsb.py | 2 +- pandas/io/excel/_xlrd.py | 2 +- pandas/io/excel/_xlsxwriter.py | 7 +- pandas/io/excel/_xlwt.py | 17 +- pandas/io/feather_format.py | 45 +- pandas/io/formats/console.py | 2 +- pandas/io/formats/csvs.py | 36 +- pandas/io/formats/excel.py | 173 ++- pandas/io/formats/format.py | 74 +- pandas/io/formats/info.py | 489 ++++----- pandas/io/formats/printing.py | 2 +- pandas/io/formats/style.py | 90 +- pandas/io/json/_json.py | 125 ++- pandas/io/orc.py | 12 +- pandas/io/parquet.py | 196 ++-- pandas/io/parsers.py | 252 ++--- 
pandas/io/pickle.py | 110 +- pandas/io/pytables.py | 5 +- pandas/io/sas/sas7bdat.py | 15 +- pandas/io/sas/sas_xport.py | 20 +- pandas/io/sas/sasreader.py | 17 +- pandas/io/sql.py | 2 +- pandas/io/stata.py | 256 +++-- pandas/plotting/_matplotlib/boxplot.py | 4 +- pandas/plotting/_matplotlib/converter.py | 2 +- pandas/plotting/_matplotlib/core.py | 31 +- pandas/plotting/_matplotlib/tools.py | 10 +- pandas/tests/arithmetic/conftest.py | 11 +- pandas/tests/arithmetic/test_datetime64.py | 27 +- pandas/tests/arithmetic/test_interval.py | 2 +- pandas/tests/arithmetic/test_numeric.py | 102 +- pandas/tests/arithmetic/test_period.py | 128 +-- pandas/tests/arithmetic/test_timedelta64.py | 2 +- .../arrays/categorical/test_analytics.py | 6 +- pandas/tests/arrays/categorical/test_api.py | 5 +- .../arrays/categorical/test_constructors.py | 19 +- .../tests/arrays/categorical/test_dtypes.py | 4 +- pandas/tests/arrays/categorical/test_take.py | 2 +- .../arrays/floating/test_construction.py | 2 +- .../tests/arrays/integer/test_construction.py | 2 +- pandas/tests/arrays/interval/test_astype.py | 23 - pandas/tests/arrays/sparse/test_array.py | 2 +- pandas/tests/arrays/sparse/test_dtype.py | 4 +- pandas/tests/arrays/string_/test_string.py | 383 ++----- .../tests/arrays/string_/test_string_arrow.py | 26 - pandas/tests/arrays/test_datetimelike.py | 104 +- pandas/tests/arrays/test_period.py | 3 +- pandas/tests/base/test_conversion.py | 4 +- pandas/tests/base/test_misc.py | 2 +- pandas/tests/base/test_value_counts.py | 8 +- pandas/tests/dtypes/test_generic.py | 1 + pandas/tests/dtypes/test_inference.py | 19 +- pandas/tests/extension/test_external_block.py | 2 +- pandas/tests/extension/test_interval.py | 10 +- pandas/tests/extension/test_sparse.py | 2 +- pandas/tests/extension/test_string.py | 58 +- pandas/tests/frame/apply/test_frame_apply.py | 60 +- .../tests/frame/apply/test_frame_transform.py | 2 + pandas/tests/frame/conftest.py | 5 - pandas/tests/frame/indexing/test_getitem.py | 23 - pandas/tests/frame/indexing/test_indexing.py | 23 +- pandas/tests/frame/indexing/test_setitem.py | 34 +- pandas/tests/frame/indexing/test_xs.py | 30 - pandas/tests/frame/methods/test_describe.py | 2 +- pandas/tests/frame/methods/test_dtypes.py | 18 +- pandas/tests/frame/methods/test_fillna.py | 15 - pandas/tests/frame/methods/test_reindex.py | 29 +- pandas/tests/frame/methods/test_replace.py | 48 +- .../tests/frame/methods/test_reset_index.py | 2 +- .../tests/frame/methods/test_select_dtypes.py | 26 +- pandas/tests/frame/methods/test_to_csv.py | 10 +- pandas/tests/frame/methods/test_to_dict.py | 4 +- pandas/tests/frame/test_alter_axes.py | 103 +- pandas/tests/frame/test_constructors.py | 74 +- pandas/tests/frame/test_logical_ops.py | 36 - pandas/tests/frame/test_nonunique_indexes.py | 30 +- pandas/tests/frame/test_reductions.py | 150 +-- pandas/tests/frame/test_repr_info.py | 8 - pandas/tests/frame/test_stack_unstack.py | 28 +- pandas/tests/frame/test_ufunc.py | 111 -- pandas/tests/generic/test_duplicate_labels.py | 8 +- pandas/tests/generic/test_finalize.py | 31 +- pandas/tests/generic/test_generic.py | 69 +- pandas/tests/generic/test_logical_ops.py | 49 + pandas/tests/generic/test_to_xarray.py | 88 +- .../tests/groupby/aggregate/test_aggregate.py | 23 +- pandas/tests/groupby/test_allowlist.py | 1 - pandas/tests/groupby/test_categorical.py | 54 +- pandas/tests/groupby/test_groupby.py | 33 +- pandas/tests/groupby/test_missing.py | 10 - pandas/tests/groupby/test_nth.py | 24 - pandas/tests/groupby/test_timegrouper.py | 58 +- 
.../tests/groupby/transform/test_transform.py | 20 +- .../tests/indexes/base_class/test_formats.py | 134 --- .../tests/indexes/base_class/test_setops.py | 110 +- .../indexes/categorical/test_category.py | 251 +++-- .../tests/indexes/categorical/test_equals.py | 77 -- .../tests/indexes/categorical/test_formats.py | 26 +- .../indexes/categorical/test_indexing.py | 54 +- pandas/tests/indexes/categorical/test_map.py | 12 +- .../tests/indexes/categorical/test_reindex.py | 2 +- pandas/tests/indexes/common.py | 161 ++- pandas/tests/indexes/conftest.py | 2 +- pandas/tests/indexes/datetimelike.py | 35 +- pandas/tests/indexes/datetimes/test_astype.py | 6 +- .../indexes/datetimes/test_constructors.py | 24 +- .../indexes/datetimes/test_date_range.py | 28 +- .../tests/indexes/datetimes/test_datetime.py | 26 +- .../tests/indexes/datetimes/test_indexing.py | 58 +- pandas/tests/indexes/datetimes/test_misc.py | 44 +- pandas/tests/indexes/datetimes/test_ops.py | 71 +- .../indexes/datetimes/test_partial_slicing.py | 13 +- pandas/tests/indexes/datetimes/test_setops.py | 24 +- pandas/tests/indexes/datetimes/test_shift.py | 4 +- .../tests/indexes/datetimes/test_timezones.py | 16 +- pandas/tests/indexes/interval/test_astype.py | 16 +- pandas/tests/indexes/interval/test_base.py | 54 +- .../indexes/interval/test_constructors.py | 20 +- pandas/tests/indexes/interval/test_equals.py | 33 - .../tests/indexes/interval/test_interval.py | 23 +- pandas/tests/indexes/interval/test_setops.py | 8 +- .../tests/indexes/multi/test_constructors.py | 22 +- pandas/tests/indexes/multi/test_drop.py | 29 - pandas/tests/indexes/multi/test_indexing.py | 12 +- pandas/tests/indexes/multi/test_sorting.py | 10 +- pandas/tests/indexes/numeric/test_indexing.py | 19 +- pandas/tests/indexes/numeric/test_setops.py | 139 --- pandas/tests/indexes/period/test_astype.py | 12 +- pandas/tests/indexes/period/test_indexing.py | 6 +- pandas/tests/indexes/period/test_ops.py | 40 +- .../indexes/period/test_partial_slicing.py | 30 +- .../tests/indexes/ranges/test_constructors.py | 14 +- pandas/tests/indexes/ranges/test_indexing.py | 2 +- pandas/tests/indexes/ranges/test_range.py | 25 + pandas/tests/indexes/ranges/test_setops.py | 25 +- pandas/tests/indexes/test_any_index.py | 14 - pandas/tests/indexes/test_base.py | 360 ++++++- pandas/tests/indexes/test_common.py | 163 ++- pandas/tests/indexes/test_datetimelike.py | 174 --- pandas/tests/indexes/test_indexing.py | 54 +- pandas/tests/indexes/test_numeric.py | 168 ++- pandas/tests/indexes/test_setops.py | 294 +----- .../tests/indexes/timedeltas/test_astype.py | 4 +- .../indexes/timedeltas/test_constructors.py | 4 +- .../tests/indexes/timedeltas/test_indexing.py | 2 +- pandas/tests/indexes/timedeltas/test_ops.py | 43 +- .../indexes/timedeltas/test_scalar_compat.py | 3 +- .../tests/indexes/timedeltas/test_setops.py | 2 +- .../indexes/timedeltas/test_timedelta.py | 6 + pandas/tests/indexing/common.py | 2 +- .../tests/indexing/interval/test_interval.py | 10 +- .../indexing/interval/test_interval_new.py | 12 +- pandas/tests/indexing/multiindex/test_loc.py | 72 -- .../tests/indexing/multiindex/test_partial.py | 48 +- .../tests/indexing/multiindex/test_setitem.py | 11 +- .../tests/indexing/multiindex/test_slice.py | 79 +- pandas/tests/indexing/test_at.py | 29 +- pandas/tests/indexing/test_categorical.py | 108 +- .../indexing/test_chaining_and_caching.py | 18 +- pandas/tests/indexing/test_coercion.py | 4 +- pandas/tests/indexing/test_datetime.py | 47 +- pandas/tests/indexing/test_floats.py | 121 +-- 
pandas/tests/indexing/test_iat.py | 15 +- pandas/tests/indexing/test_iloc.py | 86 +- pandas/tests/indexing/test_indexing.py | 143 +-- pandas/tests/indexing/test_loc.py | 143 +-- pandas/tests/indexing/test_partial.py | 10 +- pandas/tests/indexing/test_scalar.py | 32 +- pandas/tests/internals/test_internals.py | 31 +- pandas/tests/io/conftest.py | 2 +- pandas/tests/io/excel/test_writers.py | 17 +- pandas/tests/io/excel/test_xlrd.py | 2 +- .../data/html/various_dtypes_formatted.html | 36 - pandas/tests/io/formats/test_format.py | 47 +- pandas/tests/io/formats/test_info.py | 119 ++- pandas/tests/io/formats/test_style.py | 22 - pandas/tests/io/formats/test_to_csv.py | 15 +- pandas/tests/io/formats/test_to_html.py | 15 - pandas/tests/io/json/test_pandas.py | 14 +- pandas/tests/io/parser/test_compression.py | 21 +- pandas/tests/io/parser/test_read_fwf.py | 47 +- pandas/tests/io/pytables/test_store.py | 20 +- pandas/tests/io/pytables/test_timezones.py | 30 +- pandas/tests/io/test_clipboard.py | 2 +- pandas/tests/io/test_common.py | 19 +- pandas/tests/io/test_compression.py | 23 +- pandas/tests/io/test_feather.py | 2 +- pandas/tests/io/test_fsspec.py | 65 +- pandas/tests/io/test_gcs.py | 57 +- pandas/tests/io/test_html.py | 4 +- pandas/tests/io/test_parquet.py | 94 +- pandas/tests/io/test_sql.py | 22 +- pandas/tests/libs/test_hashtable.py | 336 ------ pandas/tests/plotting/frame/test_frame.py | 86 -- pandas/tests/plotting/test_converter.py | 23 +- pandas/tests/plotting/test_datetimelike.py | 44 +- pandas/tests/plotting/test_groupby.py | 4 +- pandas/tests/plotting/test_series.py | 23 - pandas/tests/reductions/test_reductions.py | 10 +- pandas/tests/resample/test_datetime_index.py | 31 +- pandas/tests/resample/test_period_index.py | 10 +- pandas/tests/resample/test_resample_api.py | 4 +- pandas/tests/reshape/concat/test_concat.py | 26 +- pandas/tests/reshape/concat/test_dataframe.py | 11 - pandas/tests/reshape/concat/test_series.py | 4 +- pandas/tests/reshape/merge/test_join.py | 14 +- pandas/tests/reshape/merge/test_merge.py | 20 +- .../tests/reshape/merge/test_merge_cross.py | 95 -- pandas/tests/reshape/test_get_dummies.py | 2 +- pandas/tests/reshape/test_pivot.py | 51 +- pandas/tests/scalar/period/test_period.py | 4 +- .../tests/scalar/timestamp/test_timestamp.py | 33 +- .../series/accessors/test_cat_accessor.py | 5 +- pandas/tests/series/indexing/test_datetime.py | 3 +- pandas/tests/series/indexing/test_getitem.py | 22 - pandas/tests/series/indexing/test_indexing.py | 4 +- pandas/tests/series/indexing/test_setitem.py | 25 - pandas/tests/series/indexing/test_xs.py | 15 - pandas/tests/series/methods/test_isin.py | 55 - pandas/tests/series/methods/test_replace.py | 30 +- pandas/tests/series/methods/test_shift.py | 2 +- pandas/tests/series/methods/test_to_csv.py | 10 +- pandas/tests/series/methods/test_to_frame.py | 4 +- pandas/tests/series/test_arithmetic.py | 36 +- pandas/tests/series/test_constructors.py | 55 +- pandas/tests/series/test_dtypes.py | 67 +- pandas/tests/series/test_reductions.py | 2 +- pandas/tests/series/test_validate.py | 2 +- pandas/tests/test_algos.py | 90 +- pandas/tests/test_common.py | 8 - pandas/tests/test_downstream.py | 1 + pandas/tests/test_multilevel.py | 39 +- pandas/tests/test_sorting.py | 16 +- pandas/tests/tools/test_to_datetime.py | 31 - pandas/tests/tools/test_to_timedelta.py | 17 - pandas/tests/tslibs/test_array_to_datetime.py | 4 +- pandas/tests/tslibs/test_parsing.py | 4 +- pandas/tests/util/test_assert_almost_equal.py | 2 +- pandas/tests/util/test_hashing.py | 
17 +- pandas/tests/window/common.py | 147 +++ pandas/tests/window/conftest.py | 83 +- pandas/tests/window/moments/conftest.py | 77 ++ .../moments/test_moments_consistency_ewm.py | 459 ++++---- .../test_moments_consistency_expanding.py | 424 ++++---- .../test_moments_consistency_rolling.py | 550 +++++----- .../tests/window/moments/test_moments_ewm.py | 12 +- .../window/moments/test_moments_rolling.py | 5 +- pandas/tests/window/test_api.py | 73 +- pandas/tests/window/test_apply.py | 11 + pandas/tests/window/test_ewm.py | 4 +- pandas/tests/window/test_expanding.py | 41 +- .../{test_groupby.py => test_grouper.py} | 303 +++--- pandas/tests/window/test_numba.py | 38 +- pandas/tests/window/test_rolling.py | 173 +-- pandas/tests/window/test_timeseries_window.py | 19 +- .../{test_win_type.py => test_window.py} | 57 +- pandas/tseries/frequencies.py | 27 +- pandas/util/_doctools.py | 20 +- release_stats.sh | 51 + ...check_for_inconsistent_pandas_namespace.py | 49 +- scripts/generate_pip_deps_from_conda.py | 5 +- scripts/validate_rst_title_capitalization.py | 1 + setup.cfg | 6 +- setup.py | 85 +- test.bat | 3 + test.sh | 4 + test_rebuild.sh | 6 + versioneer.py | 854 ++++++--------- web/pandas/community/ecosystem.md | 2 +- 444 files changed, 9213 insertions(+), 14254 deletions(-) delete mode 100644 asv_bench/benchmarks/hash_functions.py create mode 100755 ci/build39.sh create mode 100755 ci/check_cache.sh delete mode 100644 ci/deps/azure-39.yaml delete mode 100644 doc/source/development/test_writing.rst create mode 100644 doc/source/reference/panel.rst delete mode 100644 doc/source/user_guide/window.rst delete mode 100644 pandas/_libs/khash_for_primitive_helper.pxi.in delete mode 100644 pandas/core/arrays/string_arrow.py delete mode 100644 pandas/tests/arrays/interval/test_astype.py delete mode 100644 pandas/tests/arrays/string_/test_string_arrow.py delete mode 100644 pandas/tests/frame/test_ufunc.py create mode 100644 pandas/tests/generic/test_logical_ops.py delete mode 100644 pandas/tests/indexes/base_class/test_formats.py delete mode 100644 pandas/tests/indexes/categorical/test_equals.py delete mode 100644 pandas/tests/indexes/interval/test_equals.py delete mode 100644 pandas/tests/indexes/numeric/test_setops.py delete mode 100644 pandas/tests/indexes/test_datetimelike.py delete mode 100644 pandas/tests/io/formats/data/html/various_dtypes_formatted.html delete mode 100644 pandas/tests/libs/test_hashtable.py delete mode 100644 pandas/tests/reshape/merge/test_merge_cross.py create mode 100644 pandas/tests/window/common.py create mode 100644 pandas/tests/window/moments/conftest.py rename pandas/tests/window/{test_groupby.py => test_grouper.py} (77%) rename pandas/tests/window/{test_win_type.py => test_window.py} (57%) create mode 100755 release_stats.sh create mode 100644 test.bat create mode 100755 test.sh create mode 100755 test_rebuild.sh diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c00cec450c85e..b391871b18245 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -18,7 +18,7 @@ jobs: steps: - name: Setting conda path - run: echo "${HOME}/miniconda3/bin" >> $GITHUB_PATH + run: echo "::add-path::${HOME}/miniconda3/bin" - name: Checkout uses: actions/checkout@v1 @@ -98,7 +98,7 @@ jobs: steps: - name: Setting conda path - run: echo "${HOME}/miniconda3/bin" >> $GITHUB_PATH + run: echo "::set-env name=PATH::${HOME}/miniconda3/bin:${PATH}" - name: Checkout uses: actions/checkout@v1 diff --git a/.gitignore b/.gitignore index 1661862a5d066..6c3c275c48fb7 100644 
--- a/.gitignore +++ b/.gitignore @@ -12,7 +12,6 @@ *.log *.swp *.pdb -*.zip .project .pydevproject .settings diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 717334bfe1299..f9b396715664a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -26,7 +26,7 @@ repos: name: isort (cython) types: [cython] - repo: https://github.com/asottile/pyupgrade - rev: v2.7.4 + rev: v2.7.3 hooks: - id: pyupgrade args: [--py37-plus] diff --git a/.travis.yml b/.travis.yml index 1ddd886699d38..2bf72bd159fc2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -35,6 +35,11 @@ matrix: fast_finish: true include: + - dist: bionic + python: 3.9-dev + env: + - JOB="3.9-dev" PATTERN="(not slow and not network and not clipboard)" + - env: - JOB="3.8, slow" ENV_FILE="ci/deps/travis-38-slow.yaml" PATTERN="slow" SQL="1" services: @@ -89,7 +94,7 @@ install: script: - echo "script start" - echo "$JOB" - - source activate pandas-dev + - if [ "$JOB" != "3.9-dev" ]; then source activate pandas-dev; fi - ci/run_tests.sh after_script: diff --git a/Dockerfile b/Dockerfile index 5d7a2b9e6b743..b8aff5d671dcf 100644 --- a/Dockerfile +++ b/Dockerfile @@ -43,5 +43,5 @@ RUN conda env update -n base -f "$pandas_home/environment.yml" # Build C extensions and pandas RUN cd "$pandas_home" \ - && python setup.py build_ext -j 4 \ + && python setup.py build_ext --inplace -j 4 \ && python -m pip install -e . diff --git a/Makefile b/Makefile index 2c968234749f5..4f71df51de360 100644 --- a/Makefile +++ b/Makefile @@ -9,7 +9,7 @@ clean_pyc: -find . -name '*.py[co]' -exec rm {} \; build: clean_pyc - python setup.py build_ext + python setup.py build_ext --inplace lint-diff: git diff upstream/master --name-only -- "*.py" | xargs flake8 diff --git a/README.md b/README.md index 4072faffe3b3a..a2f2f1c04442a 100644 --- a/README.md +++ b/README.md @@ -60,7 +60,7 @@ Here are just a few of the things that pandas does well: and saving/loading data from the ultrafast [**HDF5 format**][hdfstore] - [**Time series**][timeseries]-specific functionality: date range generation and frequency conversion, moving window statistics, - date shifting and lagging + date shifting and lagging. 
[missing-data]: https://pandas.pydata.org/pandas-docs/stable/missing_data.html#working-with-missing-data diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 03480ae198345..65e52e03c43c7 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -5,7 +5,6 @@ from pandas._libs import lib import pandas as pd -from pandas.core.algorithms import make_duplicates_of_left_unique_in_right from .pandas_vb_common import tm @@ -175,15 +174,4 @@ def time_argsort(self, N): self.array.argsort() -class RemoveDuplicates: - def setup(self): - N = 10 ** 5 - na = np.arange(int(N / 2)) - self.left = np.concatenate([na[: int(N / 4)], na[: int(N / 4)]]) - self.right = np.concatenate([na, na]) - - def time_make_duplicates_of_left_unique_in_right(self): - make_duplicates_of_left_unique_in_right(self.left, self.right) - - from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index f3b005b704014..a0b24342091ec 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -1,5 +1,3 @@ -import string -import sys import warnings import numpy as np @@ -69,47 +67,6 @@ def time_existing_series(self): pd.Categorical(self.series) -class AsType: - def setup(self): - N = 10 ** 5 - - random_pick = np.random.default_rng().choice - - categories = { - "str": list(string.ascii_letters), - "int": np.random.randint(2 ** 16, size=154), - "float": sys.maxsize * np.random.random((38,)), - "timestamp": [ - pd.Timestamp(x, unit="s") for x in np.random.randint(2 ** 18, size=578) - ], - } - - self.df = pd.DataFrame( - {col: random_pick(cats, N) for col, cats in categories.items()} - ) - - for col in ("int", "float", "timestamp"): - self.df[col + "_as_str"] = self.df[col].astype(str) - - for col in self.df.columns: - self.df[col] = self.df[col].astype("category") - - def astype_str(self): - [self.df[col].astype("str") for col in "int float timestamp".split()] - - def astype_int(self): - [self.df[col].astype("int") for col in "int_as_str timestamp".split()] - - def astype_float(self): - [ - self.df[col].astype("float") - for col in "float_as_str int int_as_str timestamp".split() - ] - - def astype_datetime(self): - self.df["float"].astype(pd.DatetimeTZDtype(tz="US/Pacific")) - - class Concat: def setup(self): N = 10 ** 5 diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 6ce63ff8badca..22f002e6cb79a 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -486,7 +486,7 @@ def setup(self): tmp2 = (np.random.random(10000) * 10.0).astype(np.float32) tmp = np.concatenate((tmp1, tmp2)) arr = np.repeat(tmp, 10) - self.df = DataFrame({"a": arr, "b": arr}) + self.df = DataFrame(dict(a=arr, b=arr)) def time_sum(self): self.df.groupby(["a"])["b"].sum() diff --git a/asv_bench/benchmarks/hash_functions.py b/asv_bench/benchmarks/hash_functions.py deleted file mode 100644 index 17bf434acf38a..0000000000000 --- a/asv_bench/benchmarks/hash_functions.py +++ /dev/null @@ -1,164 +0,0 @@ -import numpy as np - -import pandas as pd - - -class IsinAlmostFullWithRandomInt: - params = [ - [np.float64, np.int64, np.uint64, np.object], - range(10, 21), - ] - param_names = ["dtype", "exponent"] - - def setup(self, dtype, exponent): - M = 3 * 2 ** (exponent - 2) - # 0.77-the maximal share of occupied buckets - np.random.seed(42) - self.s = pd.Series(np.random.randint(0, M, M)).astype(dtype) - self.values = 
np.random.randint(0, M, M).astype(dtype) - self.values_outside = self.values + M - - def time_isin(self, dtype, exponent): - self.s.isin(self.values) - - def time_isin_outside(self, dtype, exponent): - self.s.isin(self.values_outside) - - -class IsinWithRandomFloat: - params = [ - [np.float64, np.object], - [ - 1_300, - 2_000, - 7_000, - 8_000, - 70_000, - 80_000, - 750_000, - 900_000, - ], - ] - param_names = ["dtype", "M"] - - def setup(self, dtype, M): - np.random.seed(42) - self.values = np.random.rand(M) - self.s = pd.Series(self.values).astype(dtype) - np.random.shuffle(self.values) - self.values_outside = self.values + 0.1 - - def time_isin(self, dtype, M): - self.s.isin(self.values) - - def time_isin_outside(self, dtype, M): - self.s.isin(self.values_outside) - - -class IsinWithArangeSorted: - params = [ - [np.float64, np.int64, np.uint64, np.object], - [ - 1_000, - 2_000, - 8_000, - 100_000, - 1_000_000, - ], - ] - param_names = ["dtype", "M"] - - def setup(self, dtype, M): - self.s = pd.Series(np.arange(M)).astype(dtype) - self.values = np.arange(M).astype(dtype) - - def time_isin(self, dtype, M): - self.s.isin(self.values) - - -class IsinWithArange: - params = [ - [np.float64, np.int64, np.uint64, np.object], - [ - 1_000, - 2_000, - 8_000, - ], - [-2, 0, 2], - ] - param_names = ["dtype", "M", "offset_factor"] - - def setup(self, dtype, M, offset_factor): - offset = int(M * offset_factor) - np.random.seed(42) - tmp = pd.Series(np.random.randint(offset, M + offset, 10 ** 6)) - self.s = tmp.astype(dtype) - self.values = np.arange(M).astype(dtype) - - def time_isin(self, dtype, M, offset_factor): - self.s.isin(self.values) - - -class Float64GroupIndex: - # GH28303 - def setup(self): - self.df = pd.date_range( - start="1/1/2018", end="1/2/2018", periods=1e6 - ).to_frame() - self.group_index = np.round(self.df.index.astype(int) / 1e9) - - def time_groupby(self): - self.df.groupby(self.group_index).last() - - -class UniqueAndFactorizeArange: - params = range(4, 16) - param_names = ["exponent"] - - def setup(self, exponent): - a = np.arange(10 ** 4, dtype="float64") - self.a2 = (a + 10 ** exponent).repeat(100) - - def time_factorize(self, exponent): - pd.factorize(self.a2) - - def time_unique(self, exponent): - pd.unique(self.a2) - - -class NumericSeriesIndexing: - - params = [ - (pd.Int64Index, pd.UInt64Index, pd.Float64Index), - (10 ** 4, 10 ** 5, 5 * 10 ** 5, 10 ** 6, 5 * 10 ** 6), - ] - param_names = ["index_dtype", "N"] - - def setup(self, index, N): - vals = np.array(list(range(55)) + [54] + list(range(55, N - 1))) - indices = index(vals) - self.data = pd.Series(np.arange(N), index=indices) - - def time_loc_slice(self, index, N): - # trigger building of mapping - self.data.loc[:800] - - -class NumericSeriesIndexingShuffled: - - params = [ - (pd.Int64Index, pd.UInt64Index, pd.Float64Index), - (10 ** 4, 10 ** 5, 5 * 10 ** 5, 10 ** 6, 5 * 10 ** 6), - ] - param_names = ["index_dtype", "N"] - - def setup(self, index, N): - vals = np.array(list(range(55)) + [54] + list(range(55, N - 1))) - np.random.seed(42) - np.random.shuffle(vals) - indices = index(vals) - self.data = pd.Series(np.arange(N), index=indices) - - def time_loc_slice(self, index, N): - # trigger building of mapping - self.data.loc[:800] diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index a572b8a70a680..1333b3a0f0560 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -132,9 +132,6 @@ def time_join_dataframe_index_single_key_small(self, sort): 
def time_join_dataframe_index_shuffle_key_bigger_sort(self, sort): self.df_shuf.join(self.df_key2, on="key2", sort=sort) - def time_join_dataframes_cross(self, sort): - self.df.loc[:2000].join(self.df_key1, how="cross", sort=sort) - class JoinIndex: def setup(self): @@ -208,9 +205,6 @@ def time_merge_dataframe_integer_2key(self, sort): def time_merge_dataframe_integer_key(self, sort): merge(self.df, self.df2, on="key1", sort=sort) - def time_merge_dataframes_cross(self, sort): - merge(self.left.loc[:2000], self.right.loc[:2000], how="cross", sort=sort) - class I8Merge: diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py index 9cec8a5f7d318..21081ee23a773 100644 --- a/asv_bench/benchmarks/reshape.py +++ b/asv_bench/benchmarks/reshape.py @@ -103,10 +103,7 @@ def setup(self): nidvars = 20 N = 5000 self.letters = list("ABCD") - yrvars = [ - letter + str(num) - for letter, num in product(self.letters, range(1, nyrs + 1)) - ] + yrvars = [l + str(num) for l, num in product(self.letters, range(1, nyrs + 1))] columns = [str(i) for i in range(nidvars)] + yrvars self.df = DataFrame(np.random.randn(N, nidvars + len(yrvars)), columns=columns) self.df["id"] = self.df.index diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index 79a33c437ea5c..226b225b47591 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -225,17 +225,4 @@ def time_rolling_offset(self, method): getattr(self.groupby_roll_offset, method)() -class GroupbyEWM: - - params = ["cython", "numba"] - param_names = ["engine"] - - def setup(self, engine): - df = pd.DataFrame({"A": range(50), "B": range(50)}) - self.gb_ewm = df.groupby("A").ewm(com=1.0) - - def time_groupby_mean(self, engine): - self.gb_ewm.mean(engine=engine) - - from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 2db46abca119c..258c29c145721 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -2,7 +2,7 @@ import numpy as np -from pandas import Categorical, NaT, Series, date_range +from pandas import NaT, Series, date_range from .pandas_vb_common import tm @@ -36,28 +36,6 @@ def time_isin(self, dtypes): self.s.isin(self.values) -class IsInDatetime64: - def setup(self): - dti = date_range( - start=datetime(2015, 10, 26), end=datetime(2016, 1, 1), freq="50s" - ) - self.ser = Series(dti) - self.subset = self.ser._values[::3] - self.cat_subset = Categorical(self.subset) - - def time_isin(self): - self.ser.isin(self.subset) - - def time_isin_cat_values(self): - self.ser.isin(self.cat_subset) - - def time_isin_mismatched_dtype(self): - self.ser.isin([1, 2]) - - def time_isin_empty(self): - self.ser.isin([]) - - class IsInFloat64: def setup(self): self.small = Series([1, 2], dtype=np.float64) @@ -112,55 +90,6 @@ def time_isin_long_series_long_values_floats(self): self.s_long_floats.isin(self.vals_long_floats) -class IsInLongSeriesLookUpDominates: - params = [ - ["int64", "int32", "float64", "float32", "object"], - [5, 1000], - ["random_hits", "random_misses", "monotone_hits", "monotone_misses"], - ] - param_names = ["dtype", "MaxNumber", "series_type"] - - def setup(self, dtype, MaxNumber, series_type): - N = 10 ** 7 - if series_type == "random_hits": - np.random.seed(42) - array = np.random.randint(0, MaxNumber, N) - if series_type == "random_misses": - np.random.seed(42) - array = np.random.randint(0, MaxNumber, N) + MaxNumber - if series_type 
== "monotone_hits": - array = np.repeat(np.arange(MaxNumber), N // MaxNumber) - if series_type == "monotone_misses": - array = np.arange(N) + MaxNumber - self.series = Series(array).astype(dtype) - self.values = np.arange(MaxNumber).astype(dtype) - - def time_isin(self, dtypes, MaxNumber, series_type): - self.series.isin(self.values) - - -class IsInLongSeriesValuesDominate: - params = [ - ["int64", "int32", "float64", "float32", "object"], - ["random", "monotone"], - ] - param_names = ["dtype", "series_type"] - - def setup(self, dtype, series_type): - N = 10 ** 7 - if series_type == "random": - np.random.seed(42) - vals = np.random.randint(0, 10 * N, N) - if series_type == "monotone": - vals = np.arange(N) - self.values = vals.astype(dtype) - M = 10 ** 6 + 1 - self.series = Series(np.arange(M)).astype(dtype) - - def time_isin(self, dtypes, series_type): - self.series.isin(self.values) - - class NSort: params = ["first", "last", "all"] diff --git a/azure-pipelines.yml b/azure-pipelines.yml index c49742095e1d8..b1091ea7f60e4 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -40,7 +40,7 @@ jobs: . ~/virtualenvs/pandas-dev/bin/activate && \ python -m pip install --no-deps -U pip wheel setuptools && \ pip install cython numpy python-dateutil pytz pytest pytest-xdist hypothesis pytest-azurepipelines && \ - python setup.py build_ext -q -j2 && \ + python setup.py build_ext -q -i -j2 && \ python -m pip install --no-build-isolation -e . && \ pytest -m 'not slow and not network and not clipboard' pandas --junitxml=test-data.xml" displayName: 'Run 32-bit manylinux2014 Docker Build / Tests' diff --git a/ci/azure/posix.yml b/ci/azure/posix.yml index 8e44db0b4bcd4..3a9bb14470692 100644 --- a/ci/azure/posix.yml +++ b/ci/azure/posix.yml @@ -61,11 +61,6 @@ jobs: PANDAS_TESTING_MODE: "deprecate" EXTRA_APT: "xsel" - py39: - ENV_FILE: ci/deps/azure-39.yaml - CONDA_PY: "39" - PATTERN: "not slow and not network and not clipboard" - steps: - script: | if [ "$(uname)" == "Linux" ]; then diff --git a/ci/azure/windows.yml b/ci/azure/windows.yml index e510f4115b25f..601a834d6306a 100644 --- a/ci/azure/windows.yml +++ b/ci/azure/windows.yml @@ -34,7 +34,7 @@ jobs: - bash: | source activate pandas-dev conda list - python setup.py build_ext -q -j 4 + python setup.py build_ext -q -i -j 4 python -m pip install --no-build-isolation -e . displayName: 'Build' diff --git a/ci/build39.sh b/ci/build39.sh new file mode 100755 index 0000000000000..faef2be03c2bb --- /dev/null +++ b/ci/build39.sh @@ -0,0 +1,12 @@ +#!/bin/bash -e +# Special build for python3.9 until numpy puts its own wheels up + +pip install --no-deps -U pip wheel setuptools +pip install cython numpy python-dateutil pytz pytest pytest-xdist hypothesis + +python setup.py build_ext -inplace +python -m pip install --no-build-isolation -e . 
+ +python -c "import sys; print(sys.version_info)" +python -c "import pandas as pd" +python -c "import hypothesis" diff --git a/ci/check_cache.sh b/ci/check_cache.sh new file mode 100755 index 0000000000000..b83144fc45ef4 --- /dev/null +++ b/ci/check_cache.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +# currently not used +# script to make sure that cache is clean +# Travis CI now handles this + +if [ "$TRAVIS_PULL_REQUEST" == "false" ] +then + echo "Not a PR: checking for changes in ci/ from last 2 commits" + git diff HEAD~2 --numstat | grep -E "ci/" + ci_changes=$(git diff HEAD~2 --numstat | grep -E "ci/"| wc -l) +else + echo "PR: checking for changes in ci/ from last 2 commits" + git fetch origin pull/${TRAVIS_PULL_REQUEST}/head:PR_HEAD + git diff PR_HEAD~2 --numstat | grep -E "ci/" + ci_changes=$(git diff PR_HEAD~2 --numstat | grep -E "ci/"| wc -l) +fi + +CACHE_DIR="$HOME/.cache/" +CCACHE_DIR="$HOME/.ccache/" + +if [ $ci_changes -ne 0 ] +then + echo "Files have changed in ci/ deleting all caches" + rm -rf "$CACHE_DIR" + rm -rf "$CCACHE_DIR" +fi diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 3eeee61f62a7e..b5a6e32caa8e0 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -225,7 +225,7 @@ fi ### DOCSTRINGS ### if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then - MSG='Validate docstrings (GL03, GL04, GL05, GL06, GL07, GL09, GL10, SS01, SS02, SS04, SS05, PR03, PR04, PR05, PR10, EX04, RT01, RT04, RT05, SA02, SA03)' ; echo $MSG + MSG='Validate docstrings (GL03, GL04, GL05, GL06, GL07, GL09, GL10, SS02, SS04, SS05, PR03, PR04, PR05, PR10, EX04, RT01, RT04, RT05, SA02, SA03)' ; echo $MSG $BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=GL03,GL04,GL05,GL06,GL07,GL09,GL10,SS02,SS04,SS05,PR03,PR04,PR05,PR10,EX04,RT01,RT04,RT05,SA02,SA03 RET=$(($RET + $?)) ; echo $MSG "DONE" diff --git a/ci/deps/azure-38-locale.yaml b/ci/deps/azure-38-locale.yaml index f879111a32e67..8ce58e07a8542 100644 --- a/ci/deps/azure-38-locale.yaml +++ b/ci/deps/azure-38-locale.yaml @@ -34,7 +34,7 @@ dependencies: - xlsxwriter - xlwt - moto - - pyarrow=1.0.0 + - pyarrow>=0.15 - pip - pip: - pyxlsb diff --git a/ci/deps/azure-39.yaml b/ci/deps/azure-39.yaml deleted file mode 100644 index c4c84e73fa684..0000000000000 --- a/ci/deps/azure-39.yaml +++ /dev/null @@ -1,22 +0,0 @@ -name: pandas-dev -channels: - - conda-forge -dependencies: - - python=3.9.* - - # tools - - cython>=0.29.21 - - pytest>=5.0.1 - - pytest-xdist>=1.21 - - hypothesis>=3.58.0 - - pytest-azurepipelines - - # pandas dependencies - - numpy - - python-dateutil - - pytz - - # optional dependencies - - pytables - - scipy - - pyarrow=1.0 diff --git a/ci/deps/travis-37-locale.yaml b/ci/deps/travis-37-locale.yaml index 4e442b10482a7..e93a86910bf34 100644 --- a/ci/deps/travis-37-locale.yaml +++ b/ci/deps/travis-37-locale.yaml @@ -34,7 +34,7 @@ dependencies: - pyarrow>=0.17 - pytables>=3.5.1 - scipy - - xarray=0.12.3 + - xarray=0.12.0 - xlrd - xlsxwriter - xlwt diff --git a/ci/run_tests.sh b/ci/run_tests.sh index 78d24c814840a..9b553fbc81a03 100755 --- a/ci/run_tests.sh +++ b/ci/run_tests.sh @@ -25,7 +25,7 @@ PYTEST_CMD="${XVFB}pytest -m \"$PATTERN\" -n $PYTEST_WORKERS --dist=loadfile -s if [[ $(uname) != "Linux" && $(uname) != "Darwin" ]]; then # GH#37455 windows py38 build appears to be running out of memory # skip collection of window tests - PYTEST_CMD="$PYTEST_CMD --ignore=pandas/tests/window/ --ignore=pandas/tests/plotting/" + PYTEST_CMD="$PYTEST_CMD --ignore=pandas/tests/window/" fi echo $PYTEST_CMD diff --git a/ci/setup_env.sh 
b/ci/setup_env.sh index c36422884f2ec..247f809c5fe63 100755 --- a/ci/setup_env.sh +++ b/ci/setup_env.sh @@ -1,5 +1,10 @@ #!/bin/bash -e +if [ "$JOB" == "3.9-dev" ]; then + /bin/bash ci/build39.sh + exit 0 +fi + # edit the locale file if needed if [[ "$(uname)" == "Linux" && -n "$LC_ALL" ]]; then echo "Adding locale to the first line of pandas/__init__.py" @@ -108,12 +113,6 @@ fi echo "activate pandas-dev" source activate pandas-dev -# Explicitly set an environment variable indicating that this is pandas' CI environment. -# -# This allows us to enable things like -Werror that shouldn't be activated in -# downstream CI jobs that may also build pandas from source. -export PANDAS_CI=1 - echo echo "remove any installed pandas package" echo "w/o removing anything else" @@ -137,7 +136,7 @@ conda list pandas # Make sure any error below is reported as such echo "[Build extensions]" -python setup.py build_ext -q -j2 +python setup.py build_ext -q -i -j2 echo "[Updating pip]" python -m pip install --no-deps -U pip wheel setuptools diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index 3c5a88333be56..4261d79a5e3f5 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -146,7 +146,7 @@ Creating a development environment ---------------------------------- To test out code changes, you'll need to build pandas from source, which -requires a C/C++ compiler and Python environment. If you're making documentation +requires a C compiler and Python environment. If you're making documentation changes, you can skip to :ref:`contributing.documentation` but you won't be able to build the documentation locally before pushing your changes. @@ -183,7 +183,7 @@ See https://www.jetbrains.com/help/pycharm/docker.html for details. Note that you might need to rebuild the C extensions if/when you merge with upstream/master using:: - python setup.py build_ext -j 4 + python setup.py build_ext --inplace -j 4 .. _contributing.dev_c: @@ -195,13 +195,6 @@ operations. To install pandas from source, you need to compile these C extensions, which means you need a C compiler. This process depends on which platform you're using. -If you have setup your environment using ``conda``, the packages ``c-compiler`` -and ``cxx-compiler`` will install a fitting compiler for your platform that is -compatible with the remaining conda packages. On Windows and macOS, you will -also need to install the SDKs as they have to be distributed separately. -These packages will be automatically installed by using ``pandas``'s -``environment.yml``. - **Windows** You will need `Build Tools for Visual Studio 2017 @@ -213,33 +206,12 @@ You will need `Build Tools for Visual Studio 2017 scrolling down to "All downloads" -> "Tools for Visual Studio 2019". In the installer, select the "C++ build tools" workload. -You can install the necessary components on the commandline using -`vs_buildtools.exe `_: - -.. code:: - - vs_buildtools.exe --quiet --wait --norestart --nocache ^ - --installPath C:\BuildTools ^ - --add "Microsoft.VisualStudio.Workload.VCTools;includeRecommended" ^ - --add Microsoft.VisualStudio.Component.VC.v141 ^ - --add Microsoft.VisualStudio.Component.VC.v141.x86.x64 ^ - --add Microsoft.VisualStudio.Component.Windows10SDK.17763 - -To setup the right paths on the commandline, call -``"C:\BuildTools\VC\Auxiliary\Build\vcvars64.bat" -vcvars_ver=14.16 10.0.17763.0``. 
- **macOS** -To use the ``conda``-based compilers, you will need to install the -Developer Tools using ``xcode-select --install``. Otherwise -information about compiler installation can be found here: +Information about compiler installation can be found here: https://devguide.python.org/setup/#macos -**Linux** - -For Linux-based ``conda`` installations, you won't have to install any -additional components outside of the conda environment. The instructions -below are only needed if your setup isn't based on conda environments. +**Unix** Some Linux distributions will come with a pre-installed C compiler. To find out which compilers (and versions) are installed on your system:: @@ -271,10 +243,11 @@ Let us know if you have any difficulties by opening an issue or reaching out on Creating a Python environment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Now create an isolated pandas development environment: +Now that you have a C compiler, create an isolated pandas development +environment: -* Install either `Anaconda `_, `miniconda - `_, or `miniforge `_ +* Install either `Anaconda `_ or `miniconda + `_ * Make sure your conda is up to date (``conda update conda``) * Make sure that you have :ref:`cloned the repository ` * ``cd`` to the pandas source directory @@ -295,7 +268,7 @@ We'll now kick off a three-step process: source activate pandas-dev # Build and install pandas - python setup.py build_ext -j 4 + python setup.py build_ext --inplace -j 4 python -m pip install -e . --no-build-isolation --no-use-pep517 At this point you should be able to import pandas from your locally built version:: @@ -342,7 +315,7 @@ You'll need to have at least Python 3.6.1 installed on your system. python -m pip install -r requirements-dev.txt # Build and install pandas - python setup.py build_ext -j 4 + python setup.py build_ext --inplace -j 4 python -m pip install -e . --no-build-isolation --no-use-pep517 **Unix**/**macOS with pyenv** @@ -366,7 +339,7 @@ Consult the docs for setting up pyenv `here `__. python -m pip install -r requirements-dev.txt # Build and install pandas - python setup.py build_ext -j 4 + python setup.py build_ext --inplace -j 4 python -m pip install -e . --no-build-isolation --no-use-pep517 **Windows** @@ -392,7 +365,7 @@ should already exist. python -m pip install -r requirements-dev.txt # Build and install pandas - python setup.py build_ext -j 4 + python setup.py build_ext --inplace -j 4 python -m pip install -e . --no-build-isolation --no-use-pep517 Creating a branch @@ -469,7 +442,7 @@ Some other important things to know about the docs: contributing_docstring.rst -* The tutorials make heavy use of the `IPython directive +* The tutorials make heavy use of the `ipython directive `_ sphinx extension. This directive lets you put code in the documentation which will be run during the doc build. For example:: diff --git a/doc/source/development/contributing_docstring.rst b/doc/source/development/contributing_docstring.rst index 623d1e8d45565..26cdd0687706c 100644 --- a/doc/source/development/contributing_docstring.rst +++ b/doc/source/development/contributing_docstring.rst @@ -63,14 +63,14 @@ The first conventions every Python docstring should follow are defined in `PEP-257 `_. As PEP-257 is quite broad, other more specific standards also exist. In the -case of pandas, the NumPy docstring convention is followed. These conventions are +case of pandas, the numpy docstring convention is followed. 
These conventions are explained in this document: * `numpydoc docstring guide `_ (which is based in the original `Guide to NumPy/SciPy documentation `_) -numpydoc is a Sphinx extension to support the NumPy docstring convention. +numpydoc is a Sphinx extension to support the numpy docstring convention. The standard uses reStructuredText (reST). reStructuredText is a markup language that allows encoding styles in plain text files. Documentation @@ -401,7 +401,7 @@ DataFrame: * pandas.Categorical * pandas.arrays.SparseArray -If the exact type is not relevant, but must be compatible with a NumPy +If the exact type is not relevant, but must be compatible with a numpy array, array-like can be specified. If Any type that can be iterated is accepted, iterable can be used: @@ -819,7 +819,7 @@ positional arguments ``head(3)``. """ A sample DataFrame method. - Do not import NumPy and pandas. + Do not import numpy and pandas. Try to use meaningful data, when it makes the example easier to understand. @@ -854,7 +854,7 @@ Tips for getting your examples pass the doctests Getting the examples pass the doctests in the validation script can sometimes be tricky. Here are some attention points: -* Import all needed libraries (except for pandas and NumPy, those are already +* Import all needed libraries (except for pandas and numpy, those are already imported as ``import pandas as pd`` and ``import numpy as np``) and define all variables you use in the example. diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst index d4219296f5795..77fe930cf21e3 100644 --- a/doc/source/development/extending.rst +++ b/doc/source/development/extending.rst @@ -219,7 +219,7 @@ and re-boxes it if necessary. If applicable, we highly recommend that you implement ``__array_ufunc__`` in your extension array to avoid coercion to an ndarray. See -`the NumPy documentation `__ +`the numpy documentation `__ for an example. As part of your implementation, we require that you defer to pandas when a pandas diff --git a/doc/source/development/index.rst b/doc/source/development/index.rst index e842c827b417f..f8a6bb6deb52d 100644 --- a/doc/source/development/index.rst +++ b/doc/source/development/index.rst @@ -16,7 +16,6 @@ Development code_style maintaining internals - test_writing extending developer policies diff --git a/doc/source/development/policies.rst b/doc/source/development/policies.rst index f8e6bda2085d8..ced5b686b8246 100644 --- a/doc/source/development/policies.rst +++ b/doc/source/development/policies.rst @@ -35,7 +35,7 @@ We will not introduce new deprecations in patch releases. Deprecations will only be enforced in **major** releases. For example, if a behavior is deprecated in pandas 1.2.0, it will continue to work, with a warning, for all releases in the 1.x series. The behavior will change and the -deprecation removed in the next major release (2.0.0). +deprecation removed in the next next major release (2.0.0). .. note:: diff --git a/doc/source/development/test_writing.rst b/doc/source/development/test_writing.rst deleted file mode 100644 index d9e24bb76eed8..0000000000000 --- a/doc/source/development/test_writing.rst +++ /dev/null @@ -1,174 +0,0 @@ -.. _test_organization: - -Test organization -================= -Ideally, there should be one, and only one, obvious place for a test to reside. -Until we reach that ideal, these are some rules of thumb for where a test should -be located. - -1. Does your test depend only on code in ``pd._libs.tslibs``? 
- This test likely belongs in one of: - - - tests.tslibs - - .. note:: - - No file in ``tests.tslibs`` should import from any pandas modules - outside of ``pd._libs.tslibs`` - - - tests.scalar - - tests.tseries.offsets - -2. Does your test depend only on code in pd._libs? - This test likely belongs in one of: - - - tests.libs - - tests.groupby.test_libgroupby - -3. Is your test for an arithmetic or comparison method? - This test likely belongs in one of: - - - tests.arithmetic - - .. note:: - - These are intended for tests that can be shared to test the behavior - of DataFrame/Series/Index/ExtensionArray using the ``box_with_array`` - fixture. - - - tests.frame.test_arithmetic - - tests.series.test_arithmetic - -4. Is your test for a reduction method (min, max, sum, prod, ...)? - This test likely belongs in one of: - - - tests.reductions - - .. note:: - - These are intended for tests that can be shared to test the behavior - of DataFrame/Series/Index/ExtensionArray. - - - tests.frame.test_reductions - - tests.series.test_reductions - - tests.test_nanops - -5. Is your test for an indexing method? - This is the most difficult case for deciding where a test belongs, because - there are many of these tests, and many of them test more than one method - (e.g. both ``Series.__getitem__`` and ``Series.loc.__getitem__``) - - A) Is the test specifically testing an Index method (e.g. ``Index.get_loc``, - ``Index.get_indexer``)? - This test likely belongs in one of: - - - tests.indexes.test_indexing - - tests.indexes.fooindex.test_indexing - - Within that files there should be a method-specific test class e.g. - ``TestGetLoc``. - - In most cases, neither ``Series`` nor ``DataFrame`` objects should be - needed in these tests. - - B) Is the test for a Series or DataFrame indexing method *other* than - ``__getitem__`` or ``__setitem__``, e.g. ``xs``, ``where``, ``take``, - ``mask``, ``lookup``, or ``insert``? - This test likely belongs in one of: - - - tests.frame.indexing.test_methodname - - tests.series.indexing.test_methodname - - C) Is the test for any of ``loc``, ``iloc``, ``at``, or ``iat``? - This test likely belongs in one of: - - - tests.indexing.test_loc - - tests.indexing.test_iloc - - tests.indexing.test_at - - tests.indexing.test_iat - - Within the appropriate file, test classes correspond to either types of - indexers (e.g. ``TestLocBooleanMask``) or major use cases - (e.g. ``TestLocSetitemWithExpansion``). - - See the note in section D) about tests that test multiple indexing methods. - - D) Is the test for ``Series.__getitem__``, ``Series.__setitem__``, - ``DataFrame.__getitem__``, or ``DataFrame.__setitem__``? - This test likely belongs in one of: - - - tests.series.test_getitem - - tests.series.test_setitem - - tests.frame.test_getitem - - tests.frame.test_setitem - - If many cases such a test may test multiple similar methods, e.g. - - .. code-block:: python - - import pandas as pd - import pandas._testing as tm - - def test_getitem_listlike_of_ints(): - ser = pd.Series(range(5)) - - result = ser[[3, 4]] - expected = pd.Series([2, 3]) - tm.assert_series_equal(result, expected) - - result = ser.loc[[3, 4]] - tm.assert_series_equal(result, expected) - - In cases like this, the test location should be based on the *underlying* - method being tested. Or in the case of a test for a bugfix, the location - of the actual bug. So in this example, we know that ``Series.__getitem__`` - calls ``Series.loc.__getitem__``, so this is *really* a test for - ``loc.__getitem__``. 
So this test belongs in ``tests.indexing.test_loc``. - -6. Is your test for a DataFrame or Series method? - - A) Is the method a plotting method? - This test likely belongs in one of: - - - tests.plotting - - B) Is the method an IO method? - This test likely belongs in one of: - - - tests.io - - C) Otherwise - This test likely belongs in one of: - - - tests.series.methods.test_mymethod - - tests.frame.methods.test_mymethod - - .. note:: - - If a test can be shared between DataFrame/Series using the - ``frame_or_series`` fixture, by convention it goes in the - ``tests.frame`` file. - - - tests.generic.methods.test_mymethod - - .. note:: - - The generic/methods/ directory is only for methods with tests - that are fully parametrized over Series/DataFrame - -7. Is your test for an Index method, not depending on Series/DataFrame? - This test likely belongs in one of: - - - tests.indexes - -8) Is your test for one of the pandas-provided ExtensionArrays (``Categorical``, - ``DatetimeArray``, ``TimedeltaArray``, ``PeriodArray``, ``IntervalArray``, - ``PandasArray``, ``FloatArray``, ``BoolArray``, ``StringArray``)? - This test likely belongs in one of: - - - tests.arrays - -9) Is your test for *all* ExtensionArray subclasses (the "EA Interface")? - This test likely belongs in one of: - - - tests.extension diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index e88875a9f679c..670905f6587bc 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -174,20 +174,10 @@ invoked with the following command dtale.show(df) -D-Tale integrates seamlessly with Jupyter notebooks, Python terminals, Kaggle +D-Tale integrates seamlessly with jupyter notebooks, python terminals, kaggle & Google Colab. Here are some demos of the `grid `__ and `chart-builder `__. -`hvplot `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -hvPlot is a high-level plotting API for the PyData ecosystem built on `HoloViews `__. -It can be loaded as a native pandas plotting backend via - -.. code:: python - - pd.set_option("plotting.backend", "hvplot") - .. _ecosystem.ide: IDE @@ -431,7 +421,7 @@ If also displays progress bars. `Vaex `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Increasingly, packages are being built on top of pandas to address specific needs in data preparation, analysis and visualization. Vaex is a Python library for Out-of-Core DataFrames (similar to pandas), to visualize and explore big tabular datasets. It can calculate statistics such as mean, sum, count, standard deviation etc, on an N-dimensional grid up to a billion (10\ :sup:`9`) objects/rows per second. Visualization is done using histograms, density plots and 3d volume rendering, allowing interactive exploration of big data. Vaex uses memory mapping, zero memory copy policy and lazy computations for best performance (no memory wasted). +Increasingly, packages are being built on top of pandas to address specific needs in data preparation, analysis and visualization. Vaex is a python library for Out-of-Core DataFrames (similar to pandas), to visualize and explore big tabular datasets. It can calculate statistics such as mean, sum, count, standard deviation etc, on an N-dimensional grid up to a billion (10\ :sup:`9`) objects/rows per second. Visualization is done using histograms, density plots and 3d volume rendering, allowing interactive exploration of big data. Vaex uses memory mapping, zero memory copy policy and lazy computations for best performance (no memory wasted). 
* vaex.from_pandas * vaex.to_pandas_df diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index c823ad01f10bf..df481e8c986f7 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -284,7 +284,7 @@ pyxlsb 1.0.6 Reading for xlsb files qtpy Clipboard I/O s3fs 0.4.0 Amazon S3 access tabulate 0.8.3 Printing in Markdown-friendly format (see `tabulate`_) -xarray 0.12.3 pandas-like API for N-dimensional data +xarray 0.12.0 pandas-like API for N-dimensional data xclip Clipboard I/O on linux xlrd 1.2.0 Excel reading xlwt 1.3.0 Excel writing diff --git a/doc/source/getting_started/intro_tutorials/04_plotting.rst b/doc/source/getting_started/intro_tutorials/04_plotting.rst index b7a566a35084d..991c2bbe0fba6 100644 --- a/doc/source/getting_started/intro_tutorials/04_plotting.rst +++ b/doc/source/getting_started/intro_tutorials/04_plotting.rst @@ -131,8 +131,8 @@ standard Python to get an overview of the available plot methods: ] .. note:: - In many development environments as well as IPython and - Jupyter Notebook, use the TAB button to get an overview of the available + In many development environments as well as ipython and + jupyter notebook, use the TAB button to get an overview of the available methods, for example ``air_quality.plot.`` + TAB. One of the options is :meth:`DataFrame.plot.box`, which refers to a diff --git a/doc/source/reference/index.rst b/doc/source/reference/index.rst index f7c5eaf242b34..9d5649c37e92f 100644 --- a/doc/source/reference/index.rst +++ b/doc/source/reference/index.rst @@ -30,6 +30,7 @@ public functions related to data types in pandas. series frame arrays + panel indexing offset_frequency window diff --git a/doc/source/reference/panel.rst b/doc/source/reference/panel.rst new file mode 100644 index 0000000000000..37d48c2dadf2e --- /dev/null +++ b/doc/source/reference/panel.rst @@ -0,0 +1,10 @@ +{{ header }} + +.. _api.panel: + +===== +Panel +===== +.. currentmodule:: pandas + +``Panel`` was removed in 0.25.0. For prior documentation, see the `0.24 documentation `_ diff --git a/doc/source/reference/style.rst b/doc/source/reference/style.rst index e80dc1b57ff80..24a47336b0522 100644 --- a/doc/source/reference/style.rst +++ b/doc/source/reference/style.rst @@ -36,7 +36,6 @@ Style application Styler.where Styler.format Styler.set_precision - Styler.set_td_classes Styler.set_table_styles Styler.set_table_attributes Styler.set_caption diff --git a/doc/source/reference/window.rst b/doc/source/reference/window.rst index a255b3ae8081e..77697b966df18 100644 --- a/doc/source/reference/window.rst +++ b/doc/source/reference/window.rst @@ -10,10 +10,8 @@ Rolling objects are returned by ``.rolling`` calls: :func:`pandas.DataFrame.roll Expanding objects are returned by ``.expanding`` calls: :func:`pandas.DataFrame.expanding`, :func:`pandas.Series.expanding`, etc. ExponentialMovingWindow objects are returned by ``.ewm`` calls: :func:`pandas.DataFrame.ewm`, :func:`pandas.Series.ewm`, etc. -.. _api.functions_rolling: - -Rolling window functions ------------------------- +Standard moving window functions +-------------------------------- .. currentmodule:: pandas.core.window.rolling .. autosummary:: @@ -35,16 +33,6 @@ Rolling window functions Rolling.aggregate Rolling.quantile Rolling.sem - -.. _api.functions_window: - -Weighted window functions -------------------------- -.. currentmodule:: pandas.core.window.rolling - -.. 
autosummary:: - :toctree: api/ - Window.mean Window.sum Window.var @@ -52,8 +40,8 @@ Weighted window functions .. _api.functions_expanding: -Expanding window functions --------------------------- +Standard expanding window functions +----------------------------------- .. currentmodule:: pandas.core.window.expanding .. autosummary:: @@ -76,10 +64,8 @@ Expanding window functions Expanding.quantile Expanding.sem -.. _api.functions_ewm: - -Exponentially-weighted window functions ---------------------------------------- +Exponentially-weighted moving window functions +---------------------------------------------- .. currentmodule:: pandas.core.window.ewm .. autosummary:: @@ -91,8 +77,6 @@ Exponentially-weighted window functions ExponentialMovingWindow.corr ExponentialMovingWindow.cov -.. _api.indexers_window: - Window indexer -------------- .. currentmodule:: pandas diff --git a/doc/source/user_guide/10min.rst b/doc/source/user_guide/10min.rst index cf548ba5d1133..08f83a4674ada 100644 --- a/doc/source/user_guide/10min.rst +++ b/doc/source/user_guide/10min.rst @@ -239,13 +239,13 @@ Select via the position of the passed integers: df.iloc[3] -By integer slices, acting similar to numpy/Python: +By integer slices, acting similar to numpy/python: .. ipython:: python df.iloc[3:5, 0:2] -By lists of integer position locations, similar to the NumPy/Python style: +By lists of integer position locations, similar to the numpy/python style: .. ipython:: python diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index ffecaa222e1f9..53fabf94e24e0 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -538,8 +538,8 @@ standard deviation of 1), very concisely: Note that methods like :meth:`~DataFrame.cumsum` and :meth:`~DataFrame.cumprod` preserve the location of ``NaN`` values. This is somewhat different from -:meth:`~DataFrame.expanding` and :meth:`~DataFrame.rolling` since ``NaN`` behavior -is furthermore dictated by a ``min_periods`` parameter. +:meth:`~DataFrame.expanding` and :meth:`~DataFrame.rolling`. +For more details please see :ref:`this note `. .. ipython:: python @@ -845,7 +845,7 @@ For example, we can fit a regression using statsmodels. Their API expects a form The pipe method is inspired by unix pipes and more recently dplyr_ and magrittr_, which have introduced the popular ``(%>%)`` (read pipe) operator for R_. -The implementation of ``pipe`` here is quite clean and feels right at home in Python. +The implementation of ``pipe`` here is quite clean and feels right at home in python. We encourage you to view the source code of :meth:`~DataFrame.pipe`. .. _dplyr: https://github.com/hadley/dplyr @@ -945,7 +945,7 @@ Aggregation API The aggregation API allows one to express possibly multiple aggregation operations in a single concise way. This API is similar across pandas objects, see :ref:`groupby API `, the -:ref:`window API `, and the :ref:`resample API `. +:ref:`window functions API `, and the :ref:`resample API `. The entry point for aggregation is :meth:`DataFrame.aggregate`, or the alias :meth:`DataFrame.agg`. @@ -2203,7 +2203,7 @@ You can use the :meth:`~DataFrame.astype` method to explicitly convert dtypes fr even if the dtype was unchanged (pass ``copy=False`` to change this behavior). In addition, they will raise an exception if the astype operation is invalid. -Upcasting is always according to the **NumPy** rules. 
If two different dtypes are involved in an operation, +Upcasting is always according to the **numpy** rules. If two different dtypes are involved in an operation, then the more *general* one will be used as the result of the operation. .. ipython:: python diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst index 17d1809638d61..45d15f29fcce8 100644 --- a/doc/source/user_guide/computation.rst +++ b/doc/source/user_guide/computation.rst @@ -206,9 +206,990 @@ parameter: - ``max`` : highest rank in the group - ``first`` : ranks assigned in the order they appear in the array -.. _computation.windowing: +.. _stats.moments: -Windowing functions -~~~~~~~~~~~~~~~~~~~ +Window functions +---------------- -See :ref:`the window operations user guide ` for an overview of windowing functions. +.. currentmodule:: pandas.core.window + +For working with data, a number of window functions are provided for +computing common *window* or *rolling* statistics. Among these are count, sum, +mean, median, correlation, variance, covariance, standard deviation, skewness, +and kurtosis. + +The ``rolling()`` and ``expanding()`` +functions can be used directly from DataFrameGroupBy objects, +see the :ref:`groupby docs `. + + +.. note:: + + The API for window statistics is quite similar to the way one works with ``GroupBy`` objects, see the documentation :ref:`here `. + +.. warning:: + + When using ``rolling()`` and an associated function the results are calculated with rolling sums. As a consequence + when having values differing with magnitude :math:`1/np.finfo(np.double).eps` this results in truncation. It must be + noted, that large values may have an impact on windows, which do not include these values. `Kahan summation + `__ is used + to compute the rolling sums to preserve accuracy as much as possible. The same holds true for ``Rolling.var()`` for + values differing with magnitude :math:`(1/np.finfo(np.double).eps)^{0.5}`. + +We work with ``rolling``, ``expanding`` and ``exponentially weighted`` data through the corresponding +objects, :class:`~pandas.core.window.Rolling`, :class:`~pandas.core.window.Expanding` and :class:`~pandas.core.window.ExponentialMovingWindow`. + +.. ipython:: python + + s = pd.Series(np.random.randn(1000), index=pd.date_range("1/1/2000", periods=1000)) + s = s.cumsum() + s + +These are created from methods on ``Series`` and ``DataFrame``. + +.. ipython:: python + + r = s.rolling(window=60) + r + +These object provide tab-completion of the available methods and properties. + +.. code-block:: ipython + + In [14]: r. # noqa: E225, E999 + r.agg r.apply r.count r.exclusions r.max r.median r.name r.skew r.sum + r.aggregate r.corr r.cov r.kurt r.mean r.min r.quantile r.std r.var + +Generally these methods all have the same interface. They all +accept the following arguments: + +- ``window``: size of moving window +- ``min_periods``: threshold of non-null data points to require (otherwise + result is NA) +- ``center``: boolean, whether to set the labels at the center (default is False) + +We can then call methods on these ``rolling`` objects. These return like-indexed objects: + +.. ipython:: python + + r.mean() + +.. ipython:: python + + s.plot(style="k--") + + @savefig rolling_mean_ex.png + r.mean().plot(style="k") + +.. ipython:: python + :suppress: + + plt.close("all") + +They can also be applied to DataFrame objects. This is really just syntactic +sugar for applying the moving window operator to all of the DataFrame's columns: + +.. 
ipython:: python + + df = pd.DataFrame( + np.random.randn(1000, 4), + index=pd.date_range("1/1/2000", periods=1000), + columns=["A", "B", "C", "D"], + ) + df = df.cumsum() + + @savefig rolling_mean_frame.png + df.rolling(window=60).sum().plot(subplots=True) + +.. _stats.summary: + +Method summary +~~~~~~~~~~~~~~ + +We provide a number of common statistical functions: + +.. currentmodule:: pandas.core.window + +.. csv-table:: + :header: "Method", "Description" + :widths: 20, 80 + + :meth:`~Rolling.count`, Number of non-null observations + :meth:`~Rolling.sum`, Sum of values + :meth:`~Rolling.mean`, Mean of values + :meth:`~Rolling.median`, Arithmetic median of values + :meth:`~Rolling.min`, Minimum + :meth:`~Rolling.max`, Maximum + :meth:`~Rolling.std`, Sample standard deviation + :meth:`~Rolling.var`, Sample variance + :meth:`~Rolling.skew`, Sample skewness (3rd moment) + :meth:`~Rolling.kurt`, Sample kurtosis (4th moment) + :meth:`~Rolling.quantile`, Sample quantile (value at %) + :meth:`~Rolling.apply`, Generic apply + :meth:`~Rolling.cov`, Sample covariance (binary) + :meth:`~Rolling.corr`, Sample correlation (binary) + :meth:`~Rolling.sem`, Standard error of mean + +.. _computation.window_variance.caveats: + +.. note:: + + Please note that :meth:`~Rolling.std` and :meth:`~Rolling.var` use the sample + variance formula by default, i.e. the sum of squared differences is divided by + ``window_size - 1`` and not by ``window_size`` during averaging. In statistics, + we use sample when the dataset is drawn from a larger population that we + don't have access to. Using it implies that the data in our window is a + random sample from the population, and we are interested not in the variance + inside the specific window but in the variance of some general window that + our windows represent. In this situation, using the sample variance formula + results in an unbiased estimator and so is preferred. + + Usually, we are instead interested in the variance of each window as we slide + it over the data, and in this case we should specify ``ddof=0`` when calling + these methods to use population variance instead of sample variance. Using + sample variance under the circumstances would result in a biased estimator + of the variable we are trying to determine. + + The same caveats apply to using any supported statistical sample methods. + +.. _stats.rolling_apply: + +Rolling apply +~~~~~~~~~~~~~ + +The :meth:`~Rolling.apply` function takes an extra ``func`` argument and performs +generic rolling computations. The ``func`` argument should be a single function +that produces a single value from an ndarray input. Suppose we wanted to +compute the mean absolute deviation on a rolling basis: + +.. ipython:: python + + def mad(x): + return np.fabs(x - x.mean()).mean() + + @savefig rolling_apply_ex.png + s.rolling(window=60).apply(mad, raw=True).plot(style="k") + +Using the Numba engine +~~~~~~~~~~~~~~~~~~~~~~ + +.. versionadded:: 1.0 + +Additionally, :meth:`~Rolling.apply` can leverage `Numba `__ +if installed as an optional dependency. The apply aggregation can be executed using Numba by specifying +``engine='numba'`` and ``engine_kwargs`` arguments (``raw`` must also be set to ``True``). +Numba will be applied in potentially two routines: + +1. If ``func`` is a standard Python function, the engine will `JIT `__ +the passed function. ``func`` can also be a JITed function in which case the engine will not JIT the function again. + +2. 
The engine will JIT the for loop where the apply function is applied to each window. + +The ``engine_kwargs`` argument is a dictionary of keyword arguments that will be passed into the +`numba.jit decorator `__. +These keyword arguments will be applied to *both* the passed function (if a standard Python function) +and the apply for loop over each window. Currently only ``nogil``, ``nopython``, and ``parallel`` are supported, +and their default values are set to ``False``, ``True`` and ``False`` respectively. + +.. note:: + + In terms of performance, **the first time a function is run using the Numba engine will be slow** + as Numba will have some function compilation overhead. However, the compiled functions are cached, + and subsequent calls will be fast. In general, the Numba engine is performant with + a larger amount of data points (e.g. 1+ million). + +.. code-block:: ipython + + In [1]: data = pd.Series(range(1_000_000)) + + In [2]: roll = data.rolling(10) + + In [3]: def f(x): + ...: return np.sum(x) + 5 + # Run the first time, compilation time will affect performance + In [4]: %timeit -r 1 -n 1 roll.apply(f, engine='numba', raw=True) # noqa: E225 + 1.23 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each) + # Function is cached and performance will improve + In [5]: %timeit roll.apply(f, engine='numba', raw=True) + 188 ms ± 1.93 ms per loop (mean ± std. dev. of 7 runs, 10 loops each) + + In [6]: %timeit roll.apply(f, engine='cython', raw=True) + 3.92 s ± 59 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) + +.. _stats.rolling_window: + +Rolling windows +~~~~~~~~~~~~~~~ + +Passing ``win_type`` to ``.rolling`` generates a generic rolling window computation, that is weighted according the ``win_type``. +The following methods are available: + +.. csv-table:: + :header: "Method", "Description" + :widths: 20, 80 + + :meth:`~Window.sum`, Sum of values + :meth:`~Window.mean`, Mean of values + +The weights used in the window are specified by the ``win_type`` keyword. +The list of recognized types are the `scipy.signal window functions +`__: + +* ``boxcar`` +* ``triang`` +* ``blackman`` +* ``hamming`` +* ``bartlett`` +* ``parzen`` +* ``bohman`` +* ``blackmanharris`` +* ``nuttall`` +* ``barthann`` +* ``kaiser`` (needs beta) +* ``gaussian`` (needs std) +* ``general_gaussian`` (needs power, width) +* ``slepian`` (needs width) +* ``exponential`` (needs tau). + +.. versionadded:: 1.2.0 + +All Scipy window types, concurrent with your installed version, are recognized ``win_types``. + +.. ipython:: python + + ser = pd.Series(np.random.randn(10), index=pd.date_range("1/1/2000", periods=10)) + + ser.rolling(window=5, win_type="triang").mean() + +Note that the ``boxcar`` window is equivalent to :meth:`~Rolling.mean`. + +.. ipython:: python + + ser.rolling(window=5, win_type="boxcar").mean() + ser.rolling(window=5).mean() + +For some windowing functions, additional parameters must be specified: + +.. ipython:: python + + ser.rolling(window=5, win_type="gaussian").mean(std=0.1) + +.. _stats.moments.normalization: + +.. note:: + + For ``.sum()`` with a ``win_type``, there is no normalization done to the + weights for the window. Passing custom weights of ``[1, 1, 1]`` will yield a different + result than passing weights of ``[2, 2, 2]``, for example. When passing a + ``win_type`` instead of explicitly specifying the weights, the weights are + already normalized so that the largest weight is 1. 
+ + In contrast, the nature of the ``.mean()`` calculation is + such that the weights are normalized with respect to each other. Weights + of ``[1, 1, 1]`` and ``[2, 2, 2]`` yield the same result. + +.. _stats.moments.ts: + +Time-aware rolling +~~~~~~~~~~~~~~~~~~ + +It is possible to pass an offset (or convertible) to a ``.rolling()`` method and have it produce +variable sized windows based on the passed time window. For each time point, this includes all preceding values occurring +within the indicated time delta. + +This can be particularly useful for a non-regular time frequency index. + +.. ipython:: python + + dft = pd.DataFrame( + {"B": [0, 1, 2, np.nan, 4]}, + index=pd.date_range("20130101 09:00:00", periods=5, freq="s"), + ) + dft + +This is a regular frequency index. Using an integer window parameter works to roll along the window frequency. + +.. ipython:: python + + dft.rolling(2).sum() + dft.rolling(2, min_periods=1).sum() + +Specifying an offset allows a more intuitive specification of the rolling frequency. + +.. ipython:: python + + dft.rolling("2s").sum() + +Using a non-regular, but still monotonic index, rolling with an integer window does not impart any special calculation. + + +.. ipython:: python + + dft = pd.DataFrame( + {"B": [0, 1, 2, np.nan, 4]}, + index=pd.Index( + [ + pd.Timestamp("20130101 09:00:00"), + pd.Timestamp("20130101 09:00:02"), + pd.Timestamp("20130101 09:00:03"), + pd.Timestamp("20130101 09:00:05"), + pd.Timestamp("20130101 09:00:06"), + ], + name="foo", + ), + ) + dft + dft.rolling(2).sum() + + +Using the time-specification generates variable windows for this sparse data. + +.. ipython:: python + + dft.rolling("2s").sum() + +Furthermore, we now allow an optional ``on`` parameter to specify a column (rather than the +default of the index) in a DataFrame. + +.. ipython:: python + + dft = dft.reset_index() + dft + dft.rolling("2s", on="foo").sum() + +.. _stats.custom_rolling_window: + +Custom window rolling +~~~~~~~~~~~~~~~~~~~~~ + +.. versionadded:: 1.0 + +In addition to accepting an integer or offset as a ``window`` argument, ``rolling`` also accepts +a ``BaseIndexer`` subclass that allows a user to define a custom method for calculating window bounds. +The ``BaseIndexer`` subclass will need to define a ``get_window_bounds`` method that returns +a tuple of two arrays, the first being the starting indices of the windows and second being the +ending indices of the windows. Additionally, ``num_values``, ``min_periods``, ``center``, ``closed`` +and will automatically be passed to ``get_window_bounds`` and the defined method must +always accept these arguments. + +For example, if we have the following ``DataFrame``: + +.. ipython:: python + + use_expanding = [True, False, True, False, True] + use_expanding + df = pd.DataFrame({"values": range(5)}) + df + +and we want to use an expanding window where ``use_expanding`` is ``True`` otherwise a window of size +1, we can create the following ``BaseIndexer`` subclass: + +.. 
code-block:: ipython + + In [2]: from pandas.api.indexers import BaseIndexer + ...: + ...: class CustomIndexer(BaseIndexer): + ...: + ...: def get_window_bounds(self, num_values, min_periods, center, closed): + ...: start = np.empty(num_values, dtype=np.int64) + ...: end = np.empty(num_values, dtype=np.int64) + ...: for i in range(num_values): + ...: if self.use_expanding[i]: + ...: start[i] = 0 + ...: end[i] = i + 1 + ...: else: + ...: start[i] = i + ...: end[i] = i + self.window_size + ...: return start, end + ...: + + In [3]: indexer = CustomIndexer(window_size=1, use_expanding=use_expanding) + + In [4]: df.rolling(indexer).sum() + Out[4]: + values + 0 0.0 + 1 1.0 + 2 3.0 + 3 3.0 + 4 10.0 + +You can view other examples of ``BaseIndexer`` subclasses `here `__ + +.. versionadded:: 1.1 + +One subclass of note within those examples is the ``VariableOffsetWindowIndexer`` that allows +rolling operations over a non-fixed offset like a ``BusinessDay``. + +.. ipython:: python + + from pandas.api.indexers import VariableOffsetWindowIndexer + + df = pd.DataFrame(range(10), index=pd.date_range("2020", periods=10)) + offset = pd.offsets.BDay(1) + indexer = VariableOffsetWindowIndexer(index=df.index, offset=offset) + df + df.rolling(indexer).sum() + +For some problems knowledge of the future is available for analysis. For example, this occurs when +each data point is a full time series read from an experiment, and the task is to extract underlying +conditions. In these cases it can be useful to perform forward-looking rolling window computations. +:func:`FixedForwardWindowIndexer ` class is available for this purpose. +This :func:`BaseIndexer ` subclass implements a closed fixed-width +forward-looking rolling window, and we can use it as follows: + +.. ipython:: ipython + + from pandas.api.indexers import FixedForwardWindowIndexer + indexer = FixedForwardWindowIndexer(window_size=2) + df.rolling(indexer, min_periods=1).sum() + +.. _stats.rolling_window.endpoints: + +Rolling window endpoints +~~~~~~~~~~~~~~~~~~~~~~~~ + +The inclusion of the interval endpoints in rolling window calculations can be specified with the ``closed`` +parameter: + +.. csv-table:: + :header: "``closed``", "Description", "Default for" + :widths: 20, 30, 30 + + ``right``, close right endpoint, + ``left``, close left endpoint, + ``both``, close both endpoints, + ``neither``, open endpoints, + +For example, having the right endpoint open is useful in many problems that require that there is no contamination +from present information back to past information. This allows the rolling window to compute statistics +"up to that point in time", but not including that point in time. + +.. ipython:: python + + df = pd.DataFrame( + {"x": 1}, + index=[ + pd.Timestamp("20130101 09:00:01"), + pd.Timestamp("20130101 09:00:02"), + pd.Timestamp("20130101 09:00:03"), + pd.Timestamp("20130101 09:00:04"), + pd.Timestamp("20130101 09:00:06"), + ], + ) + + df["right"] = df.rolling("2s", closed="right").x.sum() # default + df["both"] = df.rolling("2s", closed="both").x.sum() + df["left"] = df.rolling("2s", closed="left").x.sum() + df["neither"] = df.rolling("2s", closed="neither").x.sum() + + df + +.. _stats.iter_rolling_window: + +Iteration over window: +~~~~~~~~~~~~~~~~~~~~~~ + +.. versionadded:: 1.1.0 + +``Rolling`` and ``Expanding`` objects now support iteration. Be noted that ``min_periods`` is ignored in iteration. + +.. 
ipython:: + + In [1]: df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + + In [2]: for i in df.rolling(2): + ...: print(i) + ...: + + +.. _stats.moments.ts-versus-resampling: + +Time-aware rolling vs. resampling +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Using ``.rolling()`` with a time-based index is quite similar to :ref:`resampling `. They +both operate and perform reductive operations on time-indexed pandas objects. + +When using ``.rolling()`` with an offset. The offset is a time-delta. Take a backwards-in-time looking window, and +aggregate all of the values in that window (including the end-point, but not the start-point). This is the new value +at that point in the result. These are variable sized windows in time-space for each point of the input. You will get +a same sized result as the input. + +When using ``.resample()`` with an offset. Construct a new index that is the frequency of the offset. For each frequency +bin, aggregate points from the input within a backwards-in-time looking window that fall in that bin. The result of this +aggregation is the output for that frequency point. The windows are fixed size in the frequency space. Your result +will have the shape of a regular frequency between the min and the max of the original input object. + +To summarize, ``.rolling()`` is a time-based window operation, while ``.resample()`` is a frequency-based window operation. + +Centering windows +~~~~~~~~~~~~~~~~~ + +By default the labels are set to the right edge of the window, but a +``center`` keyword is available so the labels can be set at the center. + +.. ipython:: python + + ser.rolling(window=5).mean() + ser.rolling(window=5, center=True).mean() + +.. _stats.moments.binary: + +Binary window functions +~~~~~~~~~~~~~~~~~~~~~~~ + +:meth:`~Rolling.cov` and :meth:`~Rolling.corr` can compute moving window statistics about +two ``Series`` or any combination of ``DataFrame/Series`` or +``DataFrame/DataFrame``. Here is the behavior in each case: + +* two ``Series``: compute the statistic for the pairing. +* ``DataFrame/Series``: compute the statistics for each column of the DataFrame + with the passed Series, thus returning a DataFrame. +* ``DataFrame/DataFrame``: by default compute the statistic for matching column + names, returning a DataFrame. If the keyword argument ``pairwise=True`` is + passed then computes the statistic for each pair of columns, returning a + ``MultiIndexed DataFrame`` whose ``index`` are the dates in question (see :ref:`the next section + `). + +For example: + +.. ipython:: python + + df = pd.DataFrame( + np.random.randn(1000, 4), + index=pd.date_range("1/1/2000", periods=1000), + columns=["A", "B", "C", "D"], + ) + df = df.cumsum() + + df2 = df[:20] + df2.rolling(window=5).corr(df2["B"]) + +.. _stats.moments.corr_pairwise: + +Computing rolling pairwise covariances and correlations +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In financial data analysis and other fields it's common to compute covariance +and correlation matrices for a collection of time series. Often one is also +interested in moving-window covariance and correlation matrices. This can be +done by passing the ``pairwise`` keyword argument, which in the case of +``DataFrame`` inputs will yield a MultiIndexed ``DataFrame`` whose ``index`` are the dates in +question. In the case of a single DataFrame argument the ``pairwise`` argument +can even be omitted: + +.. note:: + + Missing values are ignored and each entry is computed using the pairwise + complete observations. 
Please see the :ref:`covariance section + ` for :ref:`caveats + ` associated with this method of + calculating covariance and correlation matrices. + +.. ipython:: python + + covs = ( + df[["B", "C", "D"]] + .rolling(window=50) + .cov(df[["A", "B", "C"]], pairwise=True) + ) + covs.loc["2002-09-22":] + +.. ipython:: python + + correls = df.rolling(window=50).corr() + correls.loc["2002-09-22":] + +You can efficiently retrieve the time series of correlations between two +columns by reshaping and indexing: + +.. ipython:: python + :suppress: + + plt.close("all") + +.. ipython:: python + + @savefig rolling_corr_pairwise_ex.png + correls.unstack(1)[("A", "C")].plot() + +.. _stats.aggregate: + +Aggregation +----------- + +Once the ``Rolling``, ``Expanding`` or ``ExponentialMovingWindow`` objects have been created, several methods are available to +perform multiple computations on the data. These operations are similar to the :ref:`aggregating API `, +:ref:`groupby API `, and :ref:`resample API `. + + +.. ipython:: python + + dfa = pd.DataFrame( + np.random.randn(1000, 3), + index=pd.date_range("1/1/2000", periods=1000), + columns=["A", "B", "C"], + ) + r = dfa.rolling(window=60, min_periods=1) + r + +We can aggregate by passing a function to the entire DataFrame, or select a +Series (or multiple Series) via standard ``__getitem__``. + +.. ipython:: python + + r.aggregate(np.sum) + + r["A"].aggregate(np.sum) + + r[["A", "B"]].aggregate(np.sum) + +As you can see, the result of the aggregation will have the selected columns, or all +columns if none are selected. + +.. _stats.aggregate.multifunc: + +Applying multiple functions +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +With windowed ``Series`` you can also pass a list of functions to do +aggregation with, outputting a DataFrame: + +.. ipython:: python + + r["A"].agg([np.sum, np.mean, np.std]) + +On a windowed DataFrame, you can pass a list of functions to apply to each +column, which produces an aggregated result with a hierarchical index: + +.. ipython:: python + + r.agg([np.sum, np.mean]) + +Passing a dict of functions has different behavior by default, see the next +section. + +Applying different functions to DataFrame columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +By passing a dict to ``aggregate`` you can apply a different aggregation to the +columns of a ``DataFrame``: + +.. ipython:: python + + r.agg({"A": np.sum, "B": lambda x: np.std(x, ddof=1)}) + +The function names can also be strings. In order for a string to be valid it +must be implemented on the windowed object + +.. ipython:: python + + r.agg({"A": "sum", "B": "std"}) + +Furthermore you can pass a nested dict to indicate different aggregations on different columns. + +.. ipython:: python + + r.agg({"A": ["sum", "std"], "B": ["mean", "std"]}) + + +.. _stats.moments.expanding: + +Expanding windows +----------------- + +A common alternative to rolling statistics is to use an *expanding* window, +which yields the value of the statistic with all the data available up to that +point in time. + +These follow a similar interface to ``.rolling``, with the ``.expanding`` method +returning an :class:`~pandas.core.window.Expanding` object. + +As these calculations are a special case of rolling statistics, +they are implemented in pandas such that the following two calls are equivalent: + +.. ipython:: python + + df.rolling(window=len(df), min_periods=1).mean()[:5] + + df.expanding(min_periods=1).mean()[:5] + +These have a similar set of methods to ``.rolling`` methods. 
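A minimal sketch of the rolling/expanding equivalence described just above, using a small toy Series; the names ``s``, ``expanding_mean`` and ``rolling_mean`` are illustrative only and not part of the surrounding documentation.

.. code-block:: python

   import numpy as np
   import pandas as pd

   s = pd.Series([1.0, 2.0, np.nan, 4.0])

   # expanding statistic: uses all data observed up to each point
   expanding_mean = s.expanding(min_periods=1).mean()

   # the same result via a rolling window spanning the whole series
   rolling_mean = s.rolling(window=len(s), min_periods=1).mean()

   assert expanding_mean.equals(rolling_mean)

Because the expanding window always covers everything seen so far, each new observation has a progressively smaller relative weight, which is why the expanding statistic is smoother than its fixed-width rolling counterpart.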
+ +Method summary +~~~~~~~~~~~~~~ + +.. currentmodule:: pandas.core.window + +.. csv-table:: + :header: "Function", "Description" + :widths: 20, 80 + + :meth:`~Expanding.count`, Number of non-null observations + :meth:`~Expanding.sum`, Sum of values + :meth:`~Expanding.mean`, Mean of values + :meth:`~Expanding.median`, Arithmetic median of values + :meth:`~Expanding.min`, Minimum + :meth:`~Expanding.max`, Maximum + :meth:`~Expanding.std`, Sample standard deviation + :meth:`~Expanding.var`, Sample variance + :meth:`~Expanding.skew`, Sample skewness (3rd moment) + :meth:`~Expanding.kurt`, Sample kurtosis (4th moment) + :meth:`~Expanding.quantile`, Sample quantile (value at %) + :meth:`~Expanding.apply`, Generic apply + :meth:`~Expanding.cov`, Sample covariance (binary) + :meth:`~Expanding.corr`, Sample correlation (binary) + :meth:`~Expanding.sem`, Standard error of mean + +.. note:: + + Using sample variance formulas for :meth:`~Expanding.std` and + :meth:`~Expanding.var` comes with the same caveats as using them with rolling + windows. See :ref:`this section ` for more + information. + + The same caveats apply to using any supported statistical sample methods. + +.. currentmodule:: pandas + +Aside from not having a ``window`` parameter, these functions have the same +interfaces as their ``.rolling`` counterparts. Like above, the parameters they +all accept are: + +* ``min_periods``: threshold of non-null data points to require. Defaults to + minimum needed to compute statistic. No ``NaNs`` will be output once + ``min_periods`` non-null data points have been seen. +* ``center``: boolean, whether to set the labels at the center (default is False). + +.. _stats.moments.expanding.note: +.. note:: + + The output of the ``.rolling`` and ``.expanding`` methods do not return a + ``NaN`` if there are at least ``min_periods`` non-null values in the current + window. For example: + + .. ipython:: python + + sn = pd.Series([1, 2, np.nan, 3, np.nan, 4]) + sn + sn.rolling(2).max() + sn.rolling(2, min_periods=1).max() + + In case of expanding functions, this differs from :meth:`~DataFrame.cumsum`, + :meth:`~DataFrame.cumprod`, :meth:`~DataFrame.cummax`, + and :meth:`~DataFrame.cummin`, which return ``NaN`` in the output wherever + a ``NaN`` is encountered in the input. In order to match the output of ``cumsum`` + with ``expanding``, use :meth:`~DataFrame.fillna`: + + .. ipython:: python + + sn.expanding().sum() + sn.cumsum() + sn.cumsum().fillna(method="ffill") + + +An expanding window statistic will be more stable (and less responsive) than +its rolling window counterpart as the increasing window size decreases the +relative impact of an individual data point. As an example, here is the +:meth:`~core.window.Expanding.mean` output for the previous time series dataset: + +.. ipython:: python + :suppress: + + plt.close("all") + +.. ipython:: python + + s.plot(style="k--") + + @savefig expanding_mean_frame.png + s.expanding().mean().plot(style="k") + + +.. _stats.moments.exponentially_weighted: + +Exponentially weighted windows +------------------------------ + +.. currentmodule:: pandas.core.window + +A related set of functions are exponentially weighted versions of several of +the above statistics. A similar interface to ``.rolling`` and ``.expanding`` is accessed +through the ``.ewm`` method to receive an :class:`~ExponentialMovingWindow` object. +A number of expanding EW (exponentially weighted) +methods are provided: + + +.. 
csv-table:: + :header: "Function", "Description" + :widths: 20, 80 + + :meth:`~ExponentialMovingWindow.mean`, EW moving average + :meth:`~ExponentialMovingWindow.var`, EW moving variance + :meth:`~ExponentialMovingWindow.std`, EW moving standard deviation + :meth:`~ExponentialMovingWindow.corr`, EW moving correlation + :meth:`~ExponentialMovingWindow.cov`, EW moving covariance + +In general, a weighted moving average is calculated as + +.. math:: + + y_t = \frac{\sum_{i=0}^t w_i x_{t-i}}{\sum_{i=0}^t w_i}, + +where :math:`x_t` is the input, :math:`y_t` is the result and the :math:`w_i` +are the weights. + +The EW functions support two variants of exponential weights. +The default, ``adjust=True``, uses the weights :math:`w_i = (1 - \alpha)^i` +which gives + +.. math:: + + y_t = \frac{x_t + (1 - \alpha)x_{t-1} + (1 - \alpha)^2 x_{t-2} + ... + + (1 - \alpha)^t x_{0}}{1 + (1 - \alpha) + (1 - \alpha)^2 + ... + + (1 - \alpha)^t} + +When ``adjust=False`` is specified, moving averages are calculated as + +.. math:: + + y_0 &= x_0 \\ + y_t &= (1 - \alpha) y_{t-1} + \alpha x_t, + +which is equivalent to using weights + +.. math:: + + w_i = \begin{cases} + \alpha (1 - \alpha)^i & \text{if } i < t \\ + (1 - \alpha)^i & \text{if } i = t. + \end{cases} + +.. note:: + + These equations are sometimes written in terms of :math:`\alpha' = 1 - \alpha`, e.g. + + .. math:: + + y_t = \alpha' y_{t-1} + (1 - \alpha') x_t. + +The difference between the above two variants arises because we are +dealing with series which have finite history. Consider a series of infinite +history, with ``adjust=True``: + +.. math:: + + y_t = \frac{x_t + (1 - \alpha)x_{t-1} + (1 - \alpha)^2 x_{t-2} + ...} + {1 + (1 - \alpha) + (1 - \alpha)^2 + ...} + +Noting that the denominator is a geometric series with initial term equal to 1 +and a ratio of :math:`1 - \alpha` we have + +.. math:: + + y_t &= \frac{x_t + (1 - \alpha)x_{t-1} + (1 - \alpha)^2 x_{t-2} + ...} + {\frac{1}{1 - (1 - \alpha)}}\\ + &= [x_t + (1 - \alpha)x_{t-1} + (1 - \alpha)^2 x_{t-2} + ...] \alpha \\ + &= \alpha x_t + [(1-\alpha)x_{t-1} + (1 - \alpha)^2 x_{t-2} + ...]\alpha \\ + &= \alpha x_t + (1 - \alpha)[x_{t-1} + (1 - \alpha) x_{t-2} + ...]\alpha\\ + &= \alpha x_t + (1 - \alpha) y_{t-1} + +which is the same expression as ``adjust=False`` above and therefore +shows the equivalence of the two variants for infinite series. +When ``adjust=False``, we have :math:`y_0 = x_0` and +:math:`y_t = \alpha x_t + (1 - \alpha) y_{t-1}`. +Therefore, there is an assumption that :math:`x_0` is not an ordinary value +but rather an exponentially weighted moment of the infinite series up to that +point. + +One must have :math:`0 < \alpha \leq 1`, and while it is possible to pass +:math:`\alpha` directly, it's often easier to think about either the +**span**, **center of mass (com)** or **half-life** of an EW moment: + +.. math:: + + \alpha = + \begin{cases} + \frac{2}{s + 1}, & \text{for span}\ s \geq 1\\ + \frac{1}{1 + c}, & \text{for center of mass}\ c \geq 0\\ + 1 - \exp^{\frac{\log 0.5}{h}}, & \text{for half-life}\ h > 0 + \end{cases} + +One must specify precisely one of **span**, **center of mass**, **half-life** +and **alpha** to the EW functions: + +* **Span** corresponds to what is commonly called an "N-day EW moving average". +* **Center of mass** has a more physical interpretation and can be thought of + in terms of span: :math:`c = (s - 1) / 2`. +* **Half-life** is the period of time for the exponential weight to reduce to + one half. 
+* **Alpha** specifies the smoothing factor directly. + +.. versionadded:: 1.1.0 + +You can also specify ``halflife`` in terms of a timedelta convertible unit to specify the amount of +time it takes for an observation to decay to half its value when also specifying a sequence +of ``times``. + +.. ipython:: python + + df = pd.DataFrame({"B": [0, 1, 2, np.nan, 4]}) + df + times = ["2020-01-01", "2020-01-03", "2020-01-10", "2020-01-15", "2020-01-17"] + df.ewm(halflife="4 days", times=pd.DatetimeIndex(times)).mean() + +The following formula is used to compute exponentially weighted mean with an input vector of times: + +.. math:: + + y_t = \frac{\sum_{i=0}^t 0.5^\frac{t_{t} - t_{i}}{\lambda} x_{t-i}}{0.5^\frac{t_{t} - t_{i}}{\lambda}}, + +Here is an example for a univariate time series: + +.. ipython:: python + + s.plot(style="k--") + + @savefig ewma_ex.png + s.ewm(span=20).mean().plot(style="k") + +ExponentialMovingWindow has a ``min_periods`` argument, which has the same +meaning it does for all the ``.expanding`` and ``.rolling`` methods: +no output values will be set until at least ``min_periods`` non-null values +are encountered in the (expanding) window. + +ExponentialMovingWindow also has an ``ignore_na`` argument, which determines how +intermediate null values affect the calculation of the weights. +When ``ignore_na=False`` (the default), weights are calculated based on absolute +positions, so that intermediate null values affect the result. +When ``ignore_na=True``, +weights are calculated by ignoring intermediate null values. +For example, assuming ``adjust=True``, if ``ignore_na=False``, the weighted +average of ``3, NaN, 5`` would be calculated as + +.. math:: + + \frac{(1-\alpha)^2 \cdot 3 + 1 \cdot 5}{(1-\alpha)^2 + 1}. + +Whereas if ``ignore_na=True``, the weighted average would be calculated as + +.. math:: + + \frac{(1-\alpha) \cdot 3 + 1 \cdot 5}{(1-\alpha) + 1}. + +The :meth:`~Ewm.var`, :meth:`~Ewm.std`, and :meth:`~Ewm.cov` functions have a ``bias`` argument, +specifying whether the result should contain biased or unbiased statistics. +For example, if ``bias=True``, ``ewmvar(x)`` is calculated as +``ewmvar(x) = ewma(x**2) - ewma(x)**2``; +whereas if ``bias=False`` (the default), the biased variance statistics +are scaled by debiasing factors + +.. math:: + + \frac{\left(\sum_{i=0}^t w_i\right)^2}{\left(\sum_{i=0}^t w_i\right)^2 - \sum_{i=0}^t w_i^2}. + +(For :math:`w_i = 1`, this reduces to the usual :math:`N / (N - 1)` factor, +with :math:`N = t + 1`.) +See `Weighted Sample Variance `__ +on Wikipedia for further details. diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst index 5a6f56388dee5..939acf10d6c0b 100644 --- a/doc/source/user_guide/cookbook.rst +++ b/doc/source/user_guide/cookbook.rst @@ -18,6 +18,9 @@ above what the in-line examples offer. pandas (pd) and Numpy (np) are the only two abbreviated imported modules. The rest are kept explicitly imported for newer users. +These examples are written for Python 3. Minor tweaks might be necessary for earlier python +versions. + Idioms ------ @@ -68,7 +71,7 @@ Or use pandas where after you've set up a mask ) df.where(df_mask, -1000) -`if-then-else using NumPy's where() +`if-then-else using numpy's where() `__ .. ipython:: python @@ -1010,7 +1013,7 @@ The :ref:`Plotting ` docs. 
`Setting x-axis major and minor labels `__ -`Plotting multiple charts in an IPython Jupyter notebook +`Plotting multiple charts in an ipython notebook `__ `Creating a multi-line plot diff --git a/doc/source/user_guide/dsintro.rst b/doc/source/user_guide/dsintro.rst index f2bb99dd2ebc0..905877cca61db 100644 --- a/doc/source/user_guide/dsintro.rst +++ b/doc/source/user_guide/dsintro.rst @@ -439,7 +439,7 @@ Data Classes as introduced in `PEP557 can be passed into the DataFrame constructor. Passing a list of dataclasses is equivalent to passing a list of dictionaries. -Please be aware, that all values in the list should be dataclasses, mixing +Please be aware, that that all values in the list should be dataclasses, mixing types in the list would result in a TypeError. .. ipython:: python diff --git a/doc/source/user_guide/enhancingperf.rst b/doc/source/user_guide/enhancingperf.rst index 42621c032416d..cc8de98165fac 100644 --- a/doc/source/user_guide/enhancingperf.rst +++ b/doc/source/user_guide/enhancingperf.rst @@ -96,7 +96,7 @@ hence we'll concentrate our efforts cythonizing these two functions. Plain Cython ~~~~~~~~~~~~ -First we're going to need to import the Cython magic function to IPython: +First we're going to need to import the Cython magic function to ipython: .. ipython:: python :okwarning: @@ -123,7 +123,7 @@ is here to distinguish between function versions): .. note:: If you're having trouble pasting the above into your ipython, you may need - to be using bleeding edge IPython for paste to play well with cell magics. + to be using bleeding edge ipython for paste to play well with cell magics. .. code-block:: ipython @@ -160,7 +160,7 @@ We get another huge improvement simply by providing type information: In [4]: %timeit df.apply(lambda x: integrate_f_typed(x["a"], x["b"], x["N"]), axis=1) 10 loops, best of 3: 20.3 ms per loop -Now, we're talking! It's now over ten times faster than the original Python +Now, we're talking! It's now over ten times faster than the original python implementation, and we haven't *really* modified the code. Let's have another look at what's eating up time: @@ -375,7 +375,7 @@ Numba as an argument Additionally, we can leverage the power of `Numba `__ by calling it as an argument in :meth:`~Rolling.apply`. See :ref:`Computation tools -` for an extensive example. +` for an extensive example. Vectorize ~~~~~~~~~ diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index d6081155b58db..e8866daa9d99f 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -478,7 +478,7 @@ Aggregation Once the GroupBy object has been created, several methods are available to perform a computation on the grouped data. These operations are similar to the -:ref:`aggregating API `, :ref:`window API `, +:ref:`aggregating API `, :ref:`window functions API `, and :ref:`resample API `. An obvious one is aggregation via the @@ -524,15 +524,6 @@ index are the group names and whose values are the sizes of each group. grouped.describe() -Another aggregation example is to compute the number of unique values of each group. This is similar to the ``value_counts`` function, except that it only counts unique values. - -.. ipython:: python - - ll = [['foo', 1], ['foo', 2], ['foo', 2], ['bar', 1], ['bar', 1]] - df4 = pd.DataFrame(ll, columns=["A", "B"]) - df4 - df4.groupby("A")["B"].nunique() - .. 
note:: Aggregation functions **will not** return the groups that you are aggregating over @@ -681,7 +672,7 @@ accepts the special syntax in :meth:`GroupBy.agg`, known as "named aggregation", ) -If your desired output column names are not valid Python keywords, construct a dictionary +If your desired output column names are not valid python keywords, construct a dictionary and unpack the keyword arguments .. ipython:: python @@ -1099,7 +1090,7 @@ will be passed into ``values``, and the group index will be passed into ``index` .. warning:: When using ``engine='numba'``, there will be no "fall back" behavior internally. The group - data and group index will be passed as NumPy arrays to the JITed user defined function, and no + data and group index will be passed as numpy arrays to the JITed user defined function, and no alternative execution attempts will be tried. .. note:: diff --git a/doc/source/user_guide/index.rst b/doc/source/user_guide/index.rst index 901f42097b911..2fc9e066e6712 100644 --- a/doc/source/user_guide/index.rst +++ b/doc/source/user_guide/index.rst @@ -40,7 +40,6 @@ Further information on any specific method can be obtained in the visualization computation groupby - window timeseries timedeltas style diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index 817ea3445f995..98c981539d207 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -55,7 +55,7 @@ of multi-axis indexing. *label* of the index. This use is **not** an integer position along the index.). * A list or array of labels ``['a', 'b', 'c']``. - * A slice object with labels ``'a':'f'`` (Note that contrary to usual Python + * A slice object with labels ``'a':'f'`` (Note that contrary to usual python slices, **both** the start and the stop are included, when present in the index! See :ref:`Slicing with labels ` and :ref:`Endpoints are inclusive `.) @@ -327,7 +327,7 @@ The ``.loc`` attribute is the primary access method. The following are valid inp * A single label, e.g. ``5`` or ``'a'`` (Note that ``5`` is interpreted as a *label* of the index. This use is **not** an integer position along the index.). * A list or array of labels ``['a', 'b', 'c']``. -* A slice object with labels ``'a':'f'`` (Note that contrary to usual Python +* A slice object with labels ``'a':'f'`` (Note that contrary to usual python slices, **both** the start and the stop are included, when present in the index! See :ref:`Slicing with labels `. * A boolean array. @@ -509,11 +509,11 @@ For getting a cross section using an integer position (equiv to ``df.xs(1)``): df1.iloc[1] -Out of range slice indexes are handled gracefully just as in Python/NumPy. +Out of range slice indexes are handled gracefully just as in Python/Numpy. .. ipython:: python - # these are allowed in Python/NumPy. + # these are allowed in python/numpy. x = list('abcdef') x x[4:10] @@ -584,20 +584,48 @@ without using a temporary variable. (bb.groupby(['year', 'team']).sum() .loc[lambda df: df['r'] > 100]) +.. _indexing.deprecate_ix: -.. _combining_positional_and_label_based_indexing: +IX indexer is deprecated +------------------------ + +.. warning:: + + .. versionchanged:: 1.0.0 + + The ``.ix`` indexer was removed, in favor of the more strict ``.iloc`` and ``.loc`` indexers. -Combining positional and label-based indexing ---------------------------------------------- +``.ix`` offers a lot of magic on the inference of what the user wants to do. 
To wit, ``.ix`` can decide +to index *positionally* OR via *labels* depending on the data type of the index. This has caused quite a +bit of user confusion over the years. -If you wish to get the 0th and the 2nd elements from the index in the 'A' column, you can do: +The recommended methods of indexing are: + +* ``.loc`` if you want to *label* index. +* ``.iloc`` if you want to *positionally* index. .. ipython:: python dfd = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, index=list('abc')) + dfd + +Previous behavior, where you wish to get the 0th and the 2nd elements from the index in the 'A' column. + +.. code-block:: ipython + + In [3]: dfd.ix[[0, 2], 'A'] + Out[3]: + a 1 + c 3 + Name: A, dtype: int64 + +Using ``.loc``. Here we will select the appropriate indexes from the index, then use *label* indexing. + +.. ipython:: python + dfd.loc[dfd.index[[0, 2]], 'A'] This can also be expressed using ``.iloc``, by explicitly getting locations on the indexers, and using @@ -1130,40 +1158,6 @@ Mask s.mask(s >= 0) df.mask(df >= 0) -.. _indexing.np_where: - -Setting with enlargement conditionally using :func:`numpy` ----------------------------------------------------------- - -An alternative to :meth:`~pandas.DataFrame.where` is to use :func:`numpy.where`. -Combined with setting a new column, you can use it to enlarge a dataframe where the -values are determined conditionally. - -Consider you have two choices to choose from in the following dataframe. And you want to -set a new column color to 'green' when the second column has 'Z'. You can do the -following: - -.. ipython:: python - - df = pd.DataFrame({'col1': list('ABBC'), 'col2': list('ZZXY')}) - df['color'] = np.where(df['col2'] == 'Z', 'green', 'red') - df - -If you have multiple conditions, you can use :func:`numpy.select` to achieve that. Say -corresponding to three conditions there are three choice of colors, with a fourth color -as a fallback, you can do the following. - -.. ipython:: python - - conditions = [ - (df['col2'] == 'Z') & (df['col1'] == 'A'), - (df['col2'] == 'Z') & (df['col1'] == 'B'), - (df['col1'] == 'B') - ] - choices = ['yellow', 'blue', 'purple'] - df['color'] = np.select(conditions, choices, default='black') - df - .. _indexing.query: The :meth:`~pandas.DataFrame.query` Method diff --git a/doc/source/user_guide/integer_na.rst b/doc/source/user_guide/integer_na.rst index 2d5673fe53be3..be38736f493b5 100644 --- a/doc/source/user_guide/integer_na.rst +++ b/doc/source/user_guide/integer_na.rst @@ -117,7 +117,7 @@ dtype if needed. # coerce when needed s + 0.01 -These dtypes can operate as part of ``DataFrame``. +These dtypes can operate as part of of ``DataFrame``. .. ipython:: python diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 1bd35131622ab..1c271e74aafba 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1024,10 +1024,9 @@ Writing CSVs to binary file objects .. versionadded:: 1.2.0 -``df.to_csv(..., mode="wb")`` allows writing a CSV to a file object -opened binary mode. In most cases, it is not necessary to specify -``mode`` as Pandas will auto-detect whether the file object is -opened in text or binary mode. +``df.to_csv(..., mode="w+b")`` allows writing a CSV to a file object +opened binary mode. For this to work, it is necessary that ``mode`` +contains a "b": .. ipython:: python @@ -1035,7 +1034,7 @@ opened in text or binary mode. 
data = pd.DataFrame([0, 1, 2]) buffer = io.BytesIO() - data.to_csv(buffer, encoding="utf-8", compression="gzip") + data.to_csv(buffer, mode="w+b", encoding="utf-8", compression="gzip") .. _io.float_precision: diff --git a/doc/source/user_guide/merging.rst b/doc/source/user_guide/merging.rst index d8998a9a0a6e1..f1a28dc30dd68 100644 --- a/doc/source/user_guide/merging.rst +++ b/doc/source/user_guide/merging.rst @@ -194,7 +194,7 @@ behavior: }, index=[2, 3, 6, 7], ) - result = pd.concat([df1, df4], axis=1) + result = pd.concat([df1, df4], axis=1, sort=False) .. ipython:: python @@ -204,6 +204,13 @@ behavior: p.plot([df1, df4], result, labels=["df1", "df4"], vertical=False); plt.close("all"); +.. warning:: + + The default behavior with ``join='outer'`` is to sort the other axis + (columns in this case). In a future version of pandas, the default will + be to not sort. We specified ``sort=False`` to opt in to the new + behavior now. + Here is the same thing with ``join='inner'``: .. ipython:: python diff --git a/doc/source/user_guide/options.rst b/doc/source/user_guide/options.rst index c828bc28826b1..d222297abc70b 100644 --- a/doc/source/user_guide/options.rst +++ b/doc/source/user_guide/options.rst @@ -124,13 +124,13 @@ are restored automatically when you exit the ``with`` block: Setting startup options in Python/IPython environment ----------------------------------------------------- -Using startup scripts for the Python/IPython environment to import pandas and set options makes working with pandas more efficient. To do this, create a .py or .ipy script in the startup directory of the desired profile. An example where the startup folder is in a default IPython profile can be found at: +Using startup scripts for the Python/IPython environment to import pandas and set options makes working with pandas more efficient. To do this, create a .py or .ipy script in the startup directory of the desired profile. An example where the startup folder is in a default ipython profile can be found at: .. code-block:: none $IPYTHONDIR/profile_default/startup -More information can be found in the `IPython documentation +More information can be found in the `ipython documentation `__. An example startup script for pandas is displayed below: .. code-block:: python @@ -332,7 +332,7 @@ display.large_repr truncate For DataFrames exceeding ma (the behaviour in earlier versions of pandas). allowable settings, ['truncate', 'info'] display.latex.repr False Whether to produce a latex DataFrame - representation for Jupyter frontends + representation for jupyter frontends that support it. display.latex.escape True Escapes special characters in DataFrames, when using the to_latex method. @@ -413,7 +413,7 @@ display.show_dimensions truncate Whether to print out dimens frame is truncated (e.g. not display all rows and/or columns) display.width 80 Width of the display in characters. - In case Python/IPython is running in + In case python/IPython is running in a terminal this can be set to None and pandas will correctly auto-detect the width. Note that the IPython notebook, diff --git a/doc/source/user_guide/sparse.rst b/doc/source/user_guide/sparse.rst index e4eea57c43dbb..3156e3088d860 100644 --- a/doc/source/user_guide/sparse.rst +++ b/doc/source/user_guide/sparse.rst @@ -179,7 +179,7 @@ sparse values instead. rather than a SparseSeries or SparseDataFrame. This section provides some guidance on migrating your code to the new style. As a reminder, -you can use the Python warnings module to control warnings. 
But we recommend modifying +you can use the python warnings module to control warnings. But we recommend modifying your code, rather than ignoring the warning. **Construction** diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb index 24f344488d1ca..12dd72f761408 100644 --- a/doc/source/user_guide/style.ipynb +++ b/doc/source/user_guide/style.ipynb @@ -793,8 +793,7 @@ "source": [ "The next option you have are \"table styles\".\n", "These are styles that apply to the table as a whole, but don't look at the data.\n", - "Certain stylings, including pseudo-selectors like `:hover` can only be used this way.\n", - "These can also be used to set specific row or column based class selectors, as will be shown." + "Certain stylings, including pseudo-selectors like `:hover` can only be used this way." ] }, { @@ -832,32 +831,9 @@ "The value for `props` should be a list of tuples of `('attribute', 'value')`.\n", "\n", "`table_styles` are extremely flexible, but not as fun to type out by hand.\n", - "We hope to collect some useful ones either in pandas, or preferable in a new package that [builds on top](#Extensibility) the tools here.\n", - "\n", - "`table_styles` can be used to add column and row based class descriptors. For large tables this can increase performance by avoiding repetitive individual css for each cell, and it can also simplify style construction in some cases.\n", - "If `table_styles` is given as a dictionary each key should be a specified column or index value and this will map to specific class CSS selectors of the given column or row.\n", - "\n", - "Note that `Styler.set_table_styles` will overwrite existing styles but can be chained by setting the `overwrite` argument to `False`." + "We hope to collect some useful ones either in pandas, or preferable in a new package that [builds on top](#Extensibility) the tools here." ] }, - { - "cell_type": "code", - "execution_count": null, - "outputs": [], - "source": [ - "html = html.set_table_styles({\n", - " 'B': [dict(selector='', props=[('color', 'green')])],\n", - " 'C': [dict(selector='td', props=[('color', 'red')])], \n", - " }, overwrite=False)\n", - "html" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } - }, { "cell_type": "markdown", "metadata": {}, @@ -946,12 +922,10 @@ "- DataFrame only `(use Series.to_frame().style)`\n", "- The index and columns must be unique\n", "- No large repr, and performance isn't great; this is intended for summary DataFrames\n", - "- You can only style the *values*, not the index or columns (except with `table_styles` above)\n", + "- You can only style the *values*, not the index or columns\n", "- You can only apply styles, you can't insert new HTML entities\n", "\n", - "Some of these will be addressed in the future.\n", - "Performance can suffer when adding styles to each cell in a large DataFrame.\n", - "It is recommended to apply table or column based styles where possible to limit overall HTML length, as well as setting a shorter UUID to avoid unnecessary repeated data transmission. \n" + "Some of these will be addressed in the future.\n" ] }, { diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index bee72ec70d95e..8044172bc4c4a 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -588,12 +588,10 @@ would include matching times on an included date: .. warning:: - Indexing ``DataFrame`` rows with a *single* string with getitem (e.g. 
``frame[dtstring]``) - is deprecated starting with pandas 1.2.0 (given the ambiguity whether it is indexing - the rows or selecting a column) and will be removed in a future version. The equivalent - with ``.loc`` (e.g. ``frame.loc[dtstring]``) is still supported. + Indexing ``DataFrame`` rows with strings is deprecated in pandas 1.2.0 and will be removed in a future version. Use ``frame.loc[dtstring]`` instead. .. ipython:: python + :okwarning: dft = pd.DataFrame( np.random.randn(100000, 1), @@ -601,30 +599,34 @@ would include matching times on an included date: index=pd.date_range("20130101", periods=100000, freq="T"), ) dft - dft.loc["2013"] + dft["2013"] This starts on the very first time in the month, and includes the last date and time for the month: .. ipython:: python + :okwarning: dft["2013-1":"2013-2"] This specifies a stop time **that includes all of the times on the last day**: .. ipython:: python + :okwarning: dft["2013-1":"2013-2-28"] This specifies an **exact** stop time (and is not the same as the above): .. ipython:: python + :okwarning: dft["2013-1":"2013-2-28 00:00:00"] We are stopping on the included end-point as it is part of the index: .. ipython:: python + :okwarning: dft["2013-1-15":"2013-1-15 12:30:00"] @@ -650,6 +652,7 @@ We are stopping on the included end-point as it is part of the index: Slicing with string indexing also honors UTC offset. .. ipython:: python + :okwarning: df = pd.DataFrame([0], index=pd.DatetimeIndex(["2019-01-01"], tz="US/Pacific")) df @@ -701,14 +704,15 @@ If index resolution is second, then the minute-accurate timestamp gives a series_second.index.resolution series_second["2011-12-31 23:59"] -If the timestamp string is treated as a slice, it can be used to index ``DataFrame`` with ``.loc[]`` as well. +If the timestamp string is treated as a slice, it can be used to index ``DataFrame`` with ``[]`` as well. .. ipython:: python + :okwarning: dft_minute = pd.DataFrame( {"a": [1, 2, 3], "b": [4, 5, 6]}, index=series_minute.index ) - dft_minute.loc["2011-12-31 23"] + dft_minute["2011-12-31 23"] .. warning:: @@ -1576,6 +1580,11 @@ some advanced strategies. The ``resample()`` method can be used directly from ``DataFrameGroupBy`` objects, see the :ref:`groupby docs `. +.. note:: + + ``.resample()`` is similar to using a :meth:`~Series.rolling` operation with + a time-based offset, see a discussion :ref:`here `. + Basics ~~~~~~ @@ -1721,7 +1730,7 @@ We can instead only resample those groups where we have points as follows: Aggregation ~~~~~~~~~~~ -Similar to the :ref:`aggregating API `, :ref:`groupby API `, and the :ref:`window API `, +Similar to the :ref:`aggregating API `, :ref:`groupby API `, and the :ref:`window functions API `, a ``Resampler`` can be selectively resampled. Resampling a ``DataFrame``, the default will be to act on all columns with the same function. @@ -2120,6 +2129,7 @@ You can pass in dates and strings to ``Series`` and ``DataFrame`` with ``PeriodI Passing a string representing a lower frequency than ``PeriodIndex`` returns partial sliced data. .. ipython:: python + :okwarning: ps["2011"] @@ -2129,7 +2139,7 @@ Passing a string representing a lower frequency than ``PeriodIndex`` returns par index=pd.period_range("2013-01-01 9:00", periods=600, freq="T"), ) dfp - dfp.loc["2013-01-01 10H"] + dfp["2013-01-01 10H"] As with ``DatetimeIndex``, the endpoints will be included in the result. The example below slices data starting from 10:00 to 11:59. 
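The partial string indexing described above can be checked with a short, self-contained sketch (not part of this patch; the frame and frequencies are assumed for illustration, mirroring the ``dfp`` example): passing an hour-level string against a minute-frequency ``PeriodIndex`` selects every row of that hour, and a string slice includes both endpoints.

.. code-block:: python

    import numpy as np
    import pandas as pd

    # Assumed data, one row per minute starting at 09:00 (as in the example above).
    dfp = pd.DataFrame(
        {"a": np.arange(600)},
        index=pd.period_range("2013-01-01 9:00", periods=600, freq="T"),
    )

    # An hour-level string selects the 60 minute-level rows of that hour.
    hour = dfp.loc["2013-01-01 10H"]
    print(len(hour), hour.index.min(), hour.index.max())  # 60 rows, 10:00 .. 10:59

    # A string slice is endpoint-inclusive: it runs from 10:00 through 11:59.
    print(dfp["2013-01-01 10H":"2013-01-01 11H"].index.max())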
diff --git a/doc/source/user_guide/window.rst b/doc/source/user_guide/window.rst deleted file mode 100644 index 05f8be091fa25..0000000000000 --- a/doc/source/user_guide/window.rst +++ /dev/null @@ -1,593 +0,0 @@ -.. _window: - -{{ header }} - -******************** -Windowing Operations -******************** - -pandas contains a compact set of APIs for performing windowing operations - an operation that performs -an aggregation over a sliding partition of values. The API functions similarly to the ``groupby`` API -in that :class:`Series` and :class:`DataFrame` call the windowing method with -necessary parameters and then subsequently call the aggregation function. - -.. ipython:: python - - s = pd.Series(range(5)) - s.rolling(window=2).sum() - -The windows are comprised by looking back the length of the window from the current observation. -The result above can be derived by taking the sum of the following windowed partitions of data: - -.. ipython:: python - - for window in s.rolling(window=2): - print(window) - - -.. _window.overview: - -Overview --------- - -pandas supports 4 types of windowing operations: - -#. Rolling window: Generic fixed or variable sliding window over the values. -#. Weighted window: Weighted, non-rectangular window supplied by the ``scipy.signal`` library. -#. Expanding window: Accumulating window over the values. -#. Exponentially Weighted window: Accumulating and exponentially weighted window over the values. - -============================= ================= =========================== =========================== ======================== -Concept Method Returned Object Supports time-based windows Supports chained groupby -============================= ================= =========================== =========================== ======================== -Rolling window ``rolling`` ``Rolling`` Yes Yes -Weighted window ``rolling`` ``Window`` No No -Expanding window ``expanding`` ``Expanding`` No Yes -Exponentially Weighted window ``ewm`` ``ExponentialMovingWindow`` No Yes (as of version 1.2) -============================= ================= =========================== =========================== ======================== - -As noted above, some operations support specifying a window based on a time offset: - -.. ipython:: python - - s = pd.Series(range(5), index=pd.date_range('2020-01-01', periods=5, freq='1D')) - s.rolling(window='2D').sum() - -Additionally, some methods support chaining a ``groupby`` operation with a windowing operation -which will first group the data by the specified keys and then perform a windowing operation per group. - -.. ipython:: python - - df = pd.DataFrame({'A': ['a', 'b', 'a', 'b', 'a'], 'B': range(5)}) - df.groupby('A').expanding().sum() - -.. note:: - - Windowing operations currently only support numeric data (integer and float) - and will always return ``float64`` values. - -.. warning:: - - Some windowing aggregation, ``mean``, ``sum``, ``var`` and ``std`` methods may suffer from numerical - imprecision due to the underlying windowing algorithms accumulating sums. When values differ - with magnitude :math:`1/np.finfo(np.double).eps` this results in truncation. It must be - noted, that large values may have an impact on windows, which do not include these values. `Kahan summation - `__ is used - to compute the rolling sums to preserve accuracy as much as possible. 
- - -All windowing operations support a ``min_periods`` argument that dictates the minimum amount of -non-``np.nan`` values a window must have; otherwise, the resulting value is ``np.nan``. -``min_peridos`` defaults to 1 for time-based windows and ``window`` for fixed windows - -.. ipython:: python - - s = pd.Series([np.nan, 1, 2, np.nan, np.nan, 3]) - s.rolling(window=3, min_periods=1).sum() - s.rolling(window=3, min_periods=2).sum() - # Equivalent to min_periods=3 - s.rolling(window=3, min_periods=None).sum() - - -Additionally, all windowing operations supports the ``aggregate`` method for returning a result -of multiple aggregations applied to a window. - -.. ipython:: python - - df = pd.DataFrame({"A": range(5), "B": range(10, 15)}) - df.expanding().agg([np.sum, np.mean, np.std]) - - -.. _window.generic: - -Rolling window --------------- - -Generic rolling windows support specifying windows as a fixed number of observations or variable -number of observations based on an offset. If a time based offset is provided, the corresponding -time based index must be monotonic. - -.. ipython:: python - - times = ['2020-01-01', '2020-01-03', '2020-01-04', '2020-01-05', '2020-01-29'] - s = pd.Series(range(5), index=pd.DatetimeIndex(times)) - s - # Window with 2 observations - s.rolling(window=2).sum() - # Window with 2 days worth of observations - s.rolling(window='2D').sum() - -For all supported aggregation functions, see :ref:`api.functions_rolling`. - -.. _window.center: - -Centering windows -~~~~~~~~~~~~~~~~~ - -By default the labels are set to the right edge of the window, but a -``center`` keyword is available so the labels can be set at the center. - -.. ipython:: python - - s = pd.Series(range(10)) - s.rolling(window=5).mean() - s.rolling(window=5, center=True).mean() - - -.. _window.endpoints: - -Rolling window endpoints -~~~~~~~~~~~~~~~~~~~~~~~~ - -The inclusion of the interval endpoints in rolling window calculations can be specified with the ``closed`` -parameter: - -============= ==================== -Value Behavior -============= ==================== -``right'`` close right endpoint -``'left'`` close left endpoint -``'both'`` close both endpoints -``'neither'`` open endpoints -============= ==================== - -For example, having the right endpoint open is useful in many problems that require that there is no contamination -from present information back to past information. This allows the rolling window to compute statistics -"up to that point in time", but not including that point in time. - -.. ipython:: python - - df = pd.DataFrame( - {"x": 1}, - index=[ - pd.Timestamp("20130101 09:00:01"), - pd.Timestamp("20130101 09:00:02"), - pd.Timestamp("20130101 09:00:03"), - pd.Timestamp("20130101 09:00:04"), - pd.Timestamp("20130101 09:00:06"), - ], - ) - - df["right"] = df.rolling("2s", closed="right").x.sum() # default - df["both"] = df.rolling("2s", closed="both").x.sum() - df["left"] = df.rolling("2s", closed="left").x.sum() - df["neither"] = df.rolling("2s", closed="neither").x.sum() - - df - - -.. _window.custom_rolling_window: - -Custom window rolling -~~~~~~~~~~~~~~~~~~~~~ - -.. versionadded:: 1.0 - -In addition to accepting an integer or offset as a ``window`` argument, ``rolling`` also accepts -a ``BaseIndexer`` subclass that allows a user to define a custom method for calculating window bounds. 
-The ``BaseIndexer`` subclass will need to define a ``get_window_bounds`` method that returns -a tuple of two arrays, the first being the starting indices of the windows and second being the -ending indices of the windows. Additionally, ``num_values``, ``min_periods``, ``center``, ``closed`` -and will automatically be passed to ``get_window_bounds`` and the defined method must -always accept these arguments. - -For example, if we have the following :class:``DataFrame``: - -.. ipython:: python - - use_expanding = [True, False, True, False, True] - use_expanding - df = pd.DataFrame({"values": range(5)}) - df - -and we want to use an expanding window where ``use_expanding`` is ``True`` otherwise a window of size -1, we can create the following ``BaseIndexer`` subclass: - -.. code-block:: ipython - - In [2]: from pandas.api.indexers import BaseIndexer - ...: - ...: class CustomIndexer(BaseIndexer): - ...: - ...: def get_window_bounds(self, num_values, min_periods, center, closed): - ...: start = np.empty(num_values, dtype=np.int64) - ...: end = np.empty(num_values, dtype=np.int64) - ...: for i in range(num_values): - ...: if self.use_expanding[i]: - ...: start[i] = 0 - ...: end[i] = i + 1 - ...: else: - ...: start[i] = i - ...: end[i] = i + self.window_size - ...: return start, end - ...: - - In [3]: indexer = CustomIndexer(window_size=1, use_expanding=use_expanding) - - In [4]: df.rolling(indexer).sum() - Out[4]: - values - 0 0.0 - 1 1.0 - 2 3.0 - 3 3.0 - 4 10.0 - -You can view other examples of ``BaseIndexer`` subclasses `here `__ - -.. versionadded:: 1.1 - -One subclass of note within those examples is the ``VariableOffsetWindowIndexer`` that allows -rolling operations over a non-fixed offset like a ``BusinessDay``. - -.. ipython:: python - - from pandas.api.indexers import VariableOffsetWindowIndexer - - df = pd.DataFrame(range(10), index=pd.date_range("2020", periods=10)) - offset = pd.offsets.BDay(1) - indexer = VariableOffsetWindowIndexer(index=df.index, offset=offset) - df - df.rolling(indexer).sum() - -For some problems knowledge of the future is available for analysis. For example, this occurs when -each data point is a full time series read from an experiment, and the task is to extract underlying -conditions. In these cases it can be useful to perform forward-looking rolling window computations. -:func:`FixedForwardWindowIndexer ` class is available for this purpose. -This :func:`BaseIndexer ` subclass implements a closed fixed-width -forward-looking rolling window, and we can use it as follows: - -.. ipython:: ipython - - from pandas.api.indexers import FixedForwardWindowIndexer - indexer = FixedForwardWindowIndexer(window_size=2) - df.rolling(indexer, min_periods=1).sum() - - -.. _window.rolling_apply: - -Rolling apply -~~~~~~~~~~~~~ - -The :meth:`~Rolling.apply` function takes an extra ``func`` argument and performs -generic rolling computations. The ``func`` argument should be a single function -that produces a single value from an ndarray input. ``raw`` specifies whether -the windows are cast as :class:`Series` objects (``raw=False``) or ndarray objects (``raw=True``). - -.. ipython:: python - - def mad(x): - return np.fabs(x - x.mean()).mean() - - s = pd.Series(range(10)) - s.rolling(window=4).apply(mad, raw=True) - - -.. _window.numba_engine: - -Numba engine -~~~~~~~~~~~~ - -.. versionadded:: 1.0 - -Additionally, :meth:`~Rolling.apply` can leverage `Numba `__ -if installed as an optional dependency. 
The apply aggregation can be executed using Numba by specifying -``engine='numba'`` and ``engine_kwargs`` arguments (``raw`` must also be set to ``True``). -Numba will be applied in potentially two routines: - -#. If ``func`` is a standard Python function, the engine will `JIT `__ the passed function. ``func`` can also be a JITed function in which case the engine will not JIT the function again. -#. The engine will JIT the for loop where the apply function is applied to each window. - -The ``engine_kwargs`` argument is a dictionary of keyword arguments that will be passed into the -`numba.jit decorator `__. -These keyword arguments will be applied to *both* the passed function (if a standard Python function) -and the apply for loop over each window. Currently only ``nogil``, ``nopython``, and ``parallel`` are supported, -and their default values are set to ``False``, ``True`` and ``False`` respectively. - -.. note:: - - In terms of performance, **the first time a function is run using the Numba engine will be slow** - as Numba will have some function compilation overhead. However, the compiled functions are cached, - and subsequent calls will be fast. In general, the Numba engine is performant with - a larger amount of data points (e.g. 1+ million). - -.. code-block:: ipython - - In [1]: data = pd.Series(range(1_000_000)) - - In [2]: roll = data.rolling(10) - - In [3]: def f(x): - ...: return np.sum(x) + 5 - # Run the first time, compilation time will affect performance - In [4]: %timeit -r 1 -n 1 roll.apply(f, engine='numba', raw=True) # noqa: E225, E999 - 1.23 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each) - # Function is cached and performance will improve - In [5]: %timeit roll.apply(f, engine='numba', raw=True) - 188 ms ± 1.93 ms per loop (mean ± std. dev. of 7 runs, 10 loops each) - - In [6]: %timeit roll.apply(f, engine='cython', raw=True) - 3.92 s ± 59 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) - -.. _window.cov_corr: - -Binary window functions -~~~~~~~~~~~~~~~~~~~~~~~ - -:meth:`~Rolling.cov` and :meth:`~Rolling.corr` can compute moving window statistics about -two :class:`Series` or any combination of :class:`DataFrame`/:class:`Series` or -:class:`DataFrame`/:class:`DataFrame`. Here is the behavior in each case: - -* two :class:`Series`: compute the statistic for the pairing. -* :class:`DataFrame`/:class:`Series`: compute the statistics for each column of the DataFrame - with the passed Series, thus returning a DataFrame. -* :class:`DataFrame`/:class:`DataFrame`: by default compute the statistic for matching column - names, returning a DataFrame. If the keyword argument ``pairwise=True`` is - passed then computes the statistic for each pair of columns, returning a - ``MultiIndexed DataFrame`` whose ``index`` are the dates in question (see :ref:`the next section - `). - -For example: - -.. ipython:: python - - df = pd.DataFrame( - np.random.randn(10, 4), - index=pd.date_range("2020-01-01", periods=10), - columns=["A", "B", "C", "D"], - ) - df = df.cumsum() - - df2 = df[:4] - df2.rolling(window=2).corr(df2["B"]) - -.. _window.corr_pairwise: - -Computing rolling pairwise covariances and correlations -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -In financial data analysis and other fields it's common to compute covariance -and correlation matrices for a collection of time series. Often one is also -interested in moving-window covariance and correlation matrices. 
This can be -done by passing the ``pairwise`` keyword argument, which in the case of -:class:`DataFrame` inputs will yield a MultiIndexed :class:`DataFrame` whose ``index`` are the dates in -question. In the case of a single DataFrame argument the ``pairwise`` argument -can even be omitted: - -.. note:: - - Missing values are ignored and each entry is computed using the pairwise - complete observations. Please see the :ref:`covariance section - ` for :ref:`caveats - ` associated with this method of - calculating covariance and correlation matrices. - -.. ipython:: python - - covs = ( - df[["B", "C", "D"]] - .rolling(window=4) - .cov(df[["A", "B", "C"]], pairwise=True) - ) - covs - - -.. _window.weighted: - -Weighted window ---------------- - -The ``win_type`` argument in ``.rolling`` generates a weighted windows that are commonly used in filtering -and spectral estimation. ``win_type`` must be string that corresponds to a `scipy.signal window function -`__. -Scipy must be installed in order to use these windows, and supplementary arguments -that the Scipy window methods take must be specified in the aggregation function. - - -.. ipython:: python - - s = pd.Series(range(10)) - s.rolling(window=5).mean() - s.rolling(window=5, win_type="triang").mean() - # Supplementary Scipy arguments passed in the aggregation function - s.rolling(window=5, win_type="gaussian").mean(std=0.1) - -For all supported aggregation functions, see :ref:`api.functions_window`. - -.. _window.expanding: - -Expanding window ----------------- - -An expanding window yields the value of an aggregation statistic with all the data available up to that -point in time. Since these calculations are a special case of rolling statistics, -they are implemented in pandas such that the following two calls are equivalent: - -.. ipython:: python - - df = pd.DataFrame(range(5)) - df.rolling(window=len(df), min_periods=1).mean() - df.expanding(min_periods=1).mean() - -For all supported aggregation functions, see :ref:`api.functions_expanding`. - - -.. _window.exponentially_weighted: - -Exponentially Weighted window ------------------------------ - -An exponentially weighted window is similar to an expanding window but with each prior point -being exponentially weighted down relative to the current point. - -In general, a weighted moving average is calculated as - -.. math:: - - y_t = \frac{\sum_{i=0}^t w_i x_{t-i}}{\sum_{i=0}^t w_i}, - -where :math:`x_t` is the input, :math:`y_t` is the result and the :math:`w_i` -are the weights. - -For all supported aggregation functions, see :ref:`api.functions_ewm`. - -The EW functions support two variants of exponential weights. -The default, ``adjust=True``, uses the weights :math:`w_i = (1 - \alpha)^i` -which gives - -.. math:: - - y_t = \frac{x_t + (1 - \alpha)x_{t-1} + (1 - \alpha)^2 x_{t-2} + ... - + (1 - \alpha)^t x_{0}}{1 + (1 - \alpha) + (1 - \alpha)^2 + ... - + (1 - \alpha)^t} - -When ``adjust=False`` is specified, moving averages are calculated as - -.. math:: - - y_0 &= x_0 \\ - y_t &= (1 - \alpha) y_{t-1} + \alpha x_t, - -which is equivalent to using weights - -.. math:: - - w_i = \begin{cases} - \alpha (1 - \alpha)^i & \text{if } i < t \\ - (1 - \alpha)^i & \text{if } i = t. - \end{cases} - -.. note:: - - These equations are sometimes written in terms of :math:`\alpha' = 1 - \alpha`, e.g. - - .. math:: - - y_t = \alpha' y_{t-1} + (1 - \alpha') x_t. - -The difference between the above two variants arises because we are -dealing with series which have finite history. 
Consider a series of infinite -history, with ``adjust=True``: - -.. math:: - - y_t = \frac{x_t + (1 - \alpha)x_{t-1} + (1 - \alpha)^2 x_{t-2} + ...} - {1 + (1 - \alpha) + (1 - \alpha)^2 + ...} - -Noting that the denominator is a geometric series with initial term equal to 1 -and a ratio of :math:`1 - \alpha` we have - -.. math:: - - y_t &= \frac{x_t + (1 - \alpha)x_{t-1} + (1 - \alpha)^2 x_{t-2} + ...} - {\frac{1}{1 - (1 - \alpha)}}\\ - &= [x_t + (1 - \alpha)x_{t-1} + (1 - \alpha)^2 x_{t-2} + ...] \alpha \\ - &= \alpha x_t + [(1-\alpha)x_{t-1} + (1 - \alpha)^2 x_{t-2} + ...]\alpha \\ - &= \alpha x_t + (1 - \alpha)[x_{t-1} + (1 - \alpha) x_{t-2} + ...]\alpha\\ - &= \alpha x_t + (1 - \alpha) y_{t-1} - -which is the same expression as ``adjust=False`` above and therefore -shows the equivalence of the two variants for infinite series. -When ``adjust=False``, we have :math:`y_0 = x_0` and -:math:`y_t = \alpha x_t + (1 - \alpha) y_{t-1}`. -Therefore, there is an assumption that :math:`x_0` is not an ordinary value -but rather an exponentially weighted moment of the infinite series up to that -point. - -One must have :math:`0 < \alpha \leq 1`, and while it is possible to pass -:math:`\alpha` directly, it's often easier to think about either the -**span**, **center of mass (com)** or **half-life** of an EW moment: - -.. math:: - - \alpha = - \begin{cases} - \frac{2}{s + 1}, & \text{for span}\ s \geq 1\\ - \frac{1}{1 + c}, & \text{for center of mass}\ c \geq 0\\ - 1 - \exp^{\frac{\log 0.5}{h}}, & \text{for half-life}\ h > 0 - \end{cases} - -One must specify precisely one of **span**, **center of mass**, **half-life** -and **alpha** to the EW functions: - -* **Span** corresponds to what is commonly called an "N-day EW moving average". -* **Center of mass** has a more physical interpretation and can be thought of - in terms of span: :math:`c = (s - 1) / 2`. -* **Half-life** is the period of time for the exponential weight to reduce to - one half. -* **Alpha** specifies the smoothing factor directly. - -.. versionadded:: 1.1.0 - -You can also specify ``halflife`` in terms of a timedelta convertible unit to specify the amount of -time it takes for an observation to decay to half its value when also specifying a sequence -of ``times``. - -.. ipython:: python - - df = pd.DataFrame({"B": [0, 1, 2, np.nan, 4]}) - df - times = ["2020-01-01", "2020-01-03", "2020-01-10", "2020-01-15", "2020-01-17"] - df.ewm(halflife="4 days", times=pd.DatetimeIndex(times)).mean() - -The following formula is used to compute exponentially weighted mean with an input vector of times: - -.. math:: - - y_t = \frac{\sum_{i=0}^t 0.5^\frac{t_{t} - t_{i}}{\lambda} x_{t-i}}{0.5^\frac{t_{t} - t_{i}}{\lambda}}, - - -ExponentialMovingWindow also has an ``ignore_na`` argument, which determines how -intermediate null values affect the calculation of the weights. -When ``ignore_na=False`` (the default), weights are calculated based on absolute -positions, so that intermediate null values affect the result. -When ``ignore_na=True``, -weights are calculated by ignoring intermediate null values. -For example, assuming ``adjust=True``, if ``ignore_na=False``, the weighted -average of ``3, NaN, 5`` would be calculated as - -.. math:: - - \frac{(1-\alpha)^2 \cdot 3 + 1 \cdot 5}{(1-\alpha)^2 + 1}. - -Whereas if ``ignore_na=True``, the weighted average would be calculated as - -.. math:: - - \frac{(1-\alpha) \cdot 3 + 1 \cdot 5}{(1-\alpha) + 1}. 
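The ``ignore_na`` arithmetic above is easy to verify numerically. A minimal sketch, assuming ``alpha = 0.5`` and the series ``3, NaN, 5`` (values chosen only for illustration, with the default ``adjust=True``):

.. code-block:: python

    import numpy as np
    import pandas as pd

    alpha = 0.5
    s = pd.Series([3.0, np.nan, 5.0])

    # ignore_na=False (default): weights (1 - alpha)**2 and 1  ->  4.6
    print(s.ewm(alpha=alpha, ignore_na=False).mean().iloc[-1])

    # ignore_na=True: weights (1 - alpha) and 1  ->  4.333...
    print(s.ewm(alpha=alpha, ignore_na=True).mean().iloc[-1])

    # The same numbers computed directly from the two formulas above.
    print(((1 - alpha) ** 2 * 3 + 5) / ((1 - alpha) ** 2 + 1))
    print(((1 - alpha) * 3 + 5) / ((1 - alpha) + 1))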
- -The :meth:`~Ewm.var`, :meth:`~Ewm.std`, and :meth:`~Ewm.cov` functions have a ``bias`` argument, -specifying whether the result should contain biased or unbiased statistics. -For example, if ``bias=True``, ``ewmvar(x)`` is calculated as -``ewmvar(x) = ewma(x**2) - ewma(x)**2``; -whereas if ``bias=False`` (the default), the biased variance statistics -are scaled by debiasing factors - -.. math:: - - \frac{\left(\sum_{i=0}^t w_i\right)^2}{\left(\sum_{i=0}^t w_i\right)^2 - \sum_{i=0}^t w_i^2}. - -(For :math:`w_i = 1`, this reduces to the usual :math:`N / (N - 1)` factor, -with :math:`N = t + 1`.) -See `Weighted Sample Variance `__ -on Wikipedia for further details. diff --git a/doc/source/whatsnew/v0.12.0.rst b/doc/source/whatsnew/v0.12.0.rst index c12adb2f1334f..4de76510c6bc1 100644 --- a/doc/source/whatsnew/v0.12.0.rst +++ b/doc/source/whatsnew/v0.12.0.rst @@ -419,7 +419,7 @@ Bug fixes ~~~~~~~~~ - Plotting functions now raise a ``TypeError`` before trying to plot anything - if the associated objects have a dtype of ``object`` (:issue:`1818`, + if the associated objects have have a dtype of ``object`` (:issue:`1818`, :issue:`3572`, :issue:`3911`, :issue:`3912`), but they will try to convert object arrays to numeric arrays if possible so that you can still plot, for example, an object array with floats. This happens before any drawing takes place which @@ -430,8 +430,8 @@ Bug fixes - ``Series.str`` now supports iteration (:issue:`3638`). You can iterate over the individual elements of each string in the ``Series``. Each iteration yields - a ``Series`` with either a single character at each index of the original - ``Series`` or ``NaN``. For example, + yields a ``Series`` with either a single character at each index of the + original ``Series`` or ``NaN``. For example, .. ipython:: python :okwarning: diff --git a/doc/source/whatsnew/v0.14.0.rst b/doc/source/whatsnew/v0.14.0.rst index b59938a9b9c9b..f2401c812a979 100644 --- a/doc/source/whatsnew/v0.14.0.rst +++ b/doc/source/whatsnew/v0.14.0.rst @@ -171,7 +171,7 @@ API changes ``expanding_cov``, ``expanding_corr`` to allow the calculation of moving window covariance and correlation matrices (:issue:`4950`). See :ref:`Computing rolling pairwise covariances and correlations - ` in the docs. + ` in the docs. .. code-block:: ipython @@ -923,7 +923,7 @@ Bug fixes - ``HDFStore.select_as_multiple`` handles start and stop the same way as ``select`` (:issue:`6177`) - ``HDFStore.select_as_coordinates`` and ``select_column`` works with a ``where`` clause that results in filters (:issue:`6177`) - Regression in join of non_unique_indexes (:issue:`6329`) -- Issue with groupby ``agg`` with a single function and a mixed-type frame (:issue:`6337`) +- Issue with groupby ``agg`` with a single function and a a mixed-type frame (:issue:`6337`) - Bug in ``DataFrame.replace()`` when passing a non- ``bool`` ``to_replace`` argument (:issue:`6332`) - Raise when trying to align on different levels of a MultiIndex assignment (:issue:`3738`) diff --git a/doc/source/whatsnew/v0.15.0.rst b/doc/source/whatsnew/v0.15.0.rst index fc2b070df4392..1f054930b3709 100644 --- a/doc/source/whatsnew/v0.15.0.rst +++ b/doc/source/whatsnew/v0.15.0.rst @@ -405,7 +405,7 @@ Rolling/expanding moments improvements - :func:`rolling_window` now normalizes the weights properly in rolling mean mode (`mean=True`) so that the calculated weighted means (e.g. 'triang', 'gaussian') are distributed about the same means as those - calculated without weighting (i.e. 'boxcar'). 
See :ref:`the note on normalization ` for further details. (:issue:`7618`) + calculated without weighting (i.e. 'boxcar'). See :ref:`the note on normalization ` for further details. (:issue:`7618`) .. ipython:: python @@ -490,7 +490,7 @@ Rolling/expanding moments improvements now have an optional ``adjust`` argument, just like :func:`ewma` does, affecting how the weights are calculated. The default value of ``adjust`` is ``True``, which is backwards-compatible. - See :ref:`Exponentially weighted moment functions ` for details. (:issue:`7911`) + See :ref:`Exponentially weighted moment functions ` for details. (:issue:`7911`) - :func:`ewma`, :func:`ewmstd`, :func:`ewmvol`, :func:`ewmvar`, :func:`ewmcov`, and :func:`ewmcorr` now have an optional ``ignore_na`` argument. @@ -595,7 +595,7 @@ Rolling/expanding moments improvements 3 1.425439 dtype: float64 - See :ref:`Exponentially weighted moment functions ` for details. (:issue:`7912`) + See :ref:`Exponentially weighted moment functions ` for details. (:issue:`7912`) .. _whatsnew_0150.sql: diff --git a/doc/source/whatsnew/v0.15.2.rst b/doc/source/whatsnew/v0.15.2.rst index b5b25796fea73..95ca925f18692 100644 --- a/doc/source/whatsnew/v0.15.2.rst +++ b/doc/source/whatsnew/v0.15.2.rst @@ -136,7 +136,7 @@ Enhancements - Added ability to export Categorical data to Stata (:issue:`8633`). See :ref:`here ` for limitations of categorical variables exported to Stata data files. - Added flag ``order_categoricals`` to ``StataReader`` and ``read_stata`` to select whether to order imported categorical data (:issue:`8836`). See :ref:`here ` for more information on importing categorical variables from Stata data files. -- Added ability to export Categorical data to/from HDF5 (:issue:`7621`). Queries work the same as if it was an object array. However, the ``category`` dtyped data is stored in a more efficient manner. See :ref:`here ` for an example and caveats w.r.t. prior versions of pandas. +- Added ability to export Categorical data to to/from HDF5 (:issue:`7621`). Queries work the same as if it was an object array. However, the ``category`` dtyped data is stored in a more efficient manner. See :ref:`here ` for an example and caveats w.r.t. prior versions of pandas. - Added support for ``searchsorted()`` on ``Categorical`` class (:issue:`8420`). Other enhancements: diff --git a/doc/source/whatsnew/v0.16.1.rst b/doc/source/whatsnew/v0.16.1.rst index 269854111373f..39767684c01d0 100644 --- a/doc/source/whatsnew/v0.16.1.rst +++ b/doc/source/whatsnew/v0.16.1.rst @@ -6,7 +6,7 @@ Version 0.16.1 (May 11, 2015) {{ header }} -This is a minor bug-fix release from 0.16.0 and includes a large number of +This is a minor bug-fix release from 0.16.0 and includes a a large number of bug fixes along several new features, enhancements, and performance improvements. We recommend that all users upgrade to this version. @@ -72,7 +72,7 @@ setting the index of a ``DataFrame/Series`` with a ``category`` dtype would conv Out[4]: Index(['c', 'a', 'b'], dtype='object') -setting the index, will create a ``CategoricalIndex`` +setting the index, will create create a ``CategoricalIndex`` .. 
code-block:: ipython diff --git a/doc/source/whatsnew/v0.16.2.rst b/doc/source/whatsnew/v0.16.2.rst index 37e8c64ea9ced..194bb61f2c1c8 100644 --- a/doc/source/whatsnew/v0.16.2.rst +++ b/doc/source/whatsnew/v0.16.2.rst @@ -6,7 +6,7 @@ Version 0.16.2 (June 12, 2015) {{ header }} -This is a minor bug-fix release from 0.16.1 and includes a large number of +This is a minor bug-fix release from 0.16.1 and includes a a large number of bug fixes along some new features (:meth:`~DataFrame.pipe` method), enhancements, and performance improvements. We recommend that all users upgrade to this version. diff --git a/doc/source/whatsnew/v0.18.0.rst b/doc/source/whatsnew/v0.18.0.rst index 829c04dac9f2d..ef5242b0e33c8 100644 --- a/doc/source/whatsnew/v0.18.0.rst +++ b/doc/source/whatsnew/v0.18.0.rst @@ -53,7 +53,7 @@ New features Window functions are now methods ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Window functions have been refactored to be methods on ``Series/DataFrame`` objects, rather than top-level functions, which are now deprecated. This allows these window-type functions, to have a similar API to that of ``.groupby``. See the full documentation :ref:`here ` (:issue:`11603`, :issue:`12373`) +Window functions have been refactored to be methods on ``Series/DataFrame`` objects, rather than top-level functions, which are now deprecated. This allows these window-type functions, to have a similar API to that of ``.groupby``. See the full documentation :ref:`here ` (:issue:`11603`, :issue:`12373`) .. ipython:: python @@ -610,7 +610,7 @@ Subtraction by ``Timedelta`` in a ``Series`` by a ``Timestamp`` works (:issue:`1 pd.Timestamp('2012-01-01') - ser -``NaT.isoformat()`` now returns ``'NaT'``. This change allows +``NaT.isoformat()`` now returns ``'NaT'``. This change allows allows ``pd.Timestamp`` to rehydrate any timestamp like object from its isoformat (:issue:`12300`). diff --git a/doc/source/whatsnew/v0.19.0.rst b/doc/source/whatsnew/v0.19.0.rst index 340e1ce9ee1ef..2ac7b0f54361b 100644 --- a/doc/source/whatsnew/v0.19.0.rst +++ b/doc/source/whatsnew/v0.19.0.rst @@ -135,7 +135,7 @@ Method ``.rolling()`` is now time-series aware ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ``.rolling()`` objects are now time-series aware and can accept a time-series offset (or convertible) for the ``window`` argument (:issue:`13327`, :issue:`12995`). -See the full documentation :ref:`here `. +See the full documentation :ref:`here `. .. ipython:: python diff --git a/doc/source/whatsnew/v0.20.0.rst b/doc/source/whatsnew/v0.20.0.rst index 2cb8e13e9a18a..a9e57f0039735 100644 --- a/doc/source/whatsnew/v0.20.0.rst +++ b/doc/source/whatsnew/v0.20.0.rst @@ -459,7 +459,7 @@ Selecting via a scalar value that is contained *in* the intervals. Other enhancements ^^^^^^^^^^^^^^^^^^ -- ``DataFrame.rolling()`` now accepts the parameter ``closed='right'|'left'|'both'|'neither'`` to choose the rolling window-endpoint closedness. See the :ref:`documentation ` (:issue:`13965`) +- ``DataFrame.rolling()`` now accepts the parameter ``closed='right'|'left'|'both'|'neither'`` to choose the rolling window-endpoint closedness. See the :ref:`documentation ` (:issue:`13965`) - Integration with the ``feather-format``, including a new top-level ``pd.read_feather()`` and ``DataFrame.to_feather()`` method, see :ref:`here `. 
- ``Series.str.replace()`` now accepts a callable, as replacement, which is passed to ``re.sub`` (:issue:`15055`) - ``Series.str.replace()`` now accepts a compiled regular expression as a pattern (:issue:`15446`) @@ -988,7 +988,7 @@ A binary window operation, like ``.corr()`` or ``.cov()``, when operating on a ` will now return a 2-level ``MultiIndexed DataFrame`` rather than a ``Panel``, as ``Panel`` is now deprecated, see :ref:`here `. These are equivalent in function, but a MultiIndexed ``DataFrame`` enjoys more support in pandas. -See the section on :ref:`Windowed Binary Operations ` for more information. (:issue:`15677`) +See the section on :ref:`Windowed Binary Operations ` for more information. (:issue:`15677`) .. ipython:: python @@ -1167,7 +1167,7 @@ Other API changes - ``.loc`` has compat with ``.ix`` for accepting iterators, and NamedTuples (:issue:`15120`) - ``interpolate()`` and ``fillna()`` will raise a ``ValueError`` if the ``limit`` keyword argument is not greater than 0. (:issue:`9217`) - ``pd.read_csv()`` will now issue a ``ParserWarning`` whenever there are conflicting values provided by the ``dialect`` parameter and the user (:issue:`14898`) -- ``pd.read_csv()`` will now raise a ``ValueError`` for the C engine if the quote character is larger than one byte (:issue:`11592`) +- ``pd.read_csv()`` will now raise a ``ValueError`` for the C engine if the quote character is larger than than one byte (:issue:`11592`) - ``inplace`` arguments now require a boolean value, else a ``ValueError`` is thrown (:issue:`14189`) - ``pandas.api.types.is_datetime64_ns_dtype`` will now report ``True`` on a tz-aware dtype, similar to ``pandas.api.types.is_datetime64_any_dtype`` - ``DataFrame.asof()`` will return a null filled ``Series`` instead the scalar ``NaN`` if a match is not found (:issue:`15118`) @@ -1315,7 +1315,7 @@ The recommended methods of indexing are: - ``.loc`` if you want to *label* index - ``.iloc`` if you want to *positionally* index. -Using ``.ix`` will now show a ``DeprecationWarning`` with a link to some examples of how to convert code `here `__. +Using ``.ix`` will now show a ``DeprecationWarning`` with a link to some examples of how to convert code :ref:`here `. .. 
ipython:: python @@ -1663,11 +1663,11 @@ Indexing - Bug in ``.reset_index()`` when an all ``NaN`` level of a ``MultiIndex`` would fail (:issue:`6322`) - Bug in ``.reset_index()`` when raising error for index name already present in ``MultiIndex`` columns (:issue:`16120`) - Bug in creating a ``MultiIndex`` with tuples and not passing a list of names; this will now raise ``ValueError`` (:issue:`15110`) -- Bug in the HTML display with a ``MultiIndex`` and truncation (:issue:`14882`) +- Bug in the HTML display with with a ``MultiIndex`` and truncation (:issue:`14882`) - Bug in the display of ``.info()`` where a qualifier (+) would always be displayed with a ``MultiIndex`` that contains only non-strings (:issue:`15245`) - Bug in ``pd.concat()`` where the names of ``MultiIndex`` of resulting ``DataFrame`` are not handled correctly when ``None`` is presented in the names of ``MultiIndex`` of input ``DataFrame`` (:issue:`15787`) - Bug in ``DataFrame.sort_index()`` and ``Series.sort_index()`` where ``na_position`` doesn't work with a ``MultiIndex`` (:issue:`14784`, :issue:`16604`) -- Bug in ``pd.concat()`` when combining objects with a ``CategoricalIndex`` (:issue:`16111`) +- Bug in in ``pd.concat()`` when combining objects with a ``CategoricalIndex`` (:issue:`16111`) - Bug in indexing with a scalar and a ``CategoricalIndex`` (:issue:`16123`) IO diff --git a/doc/source/whatsnew/v0.21.0.rst b/doc/source/whatsnew/v0.21.0.rst index 1bbbbdc7e5410..6035b89aa8643 100644 --- a/doc/source/whatsnew/v0.21.0.rst +++ b/doc/source/whatsnew/v0.21.0.rst @@ -50,7 +50,7 @@ Parquet is designed to faithfully serialize and de-serialize ``DataFrame`` s, su dtypes, including extension dtypes such as datetime with timezones. This functionality depends on either the `pyarrow `__ or `fastparquet `__ library. -For more details, see :ref:`the IO docs on Parquet `. +For more details, see see :ref:`the IO docs on Parquet `. .. 
_whatsnew_0210.enhancements.infer_objects: diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index ce784231a47d2..9ef50045d5b5e 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1622,7 +1622,7 @@ Timedelta - Bug in :class:`DataFrame` with ``timedelta64[ns]`` dtype division by ``Timedelta``-like scalar incorrectly returning ``timedelta64[ns]`` dtype instead of ``float64`` dtype (:issue:`20088`, :issue:`22163`) - Bug in adding a :class:`Index` with object dtype to a :class:`Series` with ``timedelta64[ns]`` dtype incorrectly raising (:issue:`22390`) - Bug in multiplying a :class:`Series` with numeric dtype against a ``timedelta`` object (:issue:`22390`) -- Bug in :class:`Series` with numeric dtype when adding or subtracting an array or ``Series`` with ``timedelta64`` dtype (:issue:`22390`) +- Bug in :class:`Series` with numeric dtype when adding or subtracting an an array or ``Series`` with ``timedelta64`` dtype (:issue:`22390`) - Bug in :class:`Index` with numeric dtype when multiplying or dividing an array with dtype ``timedelta64`` (:issue:`22390`) - Bug in :class:`TimedeltaIndex` incorrectly allowing indexing with ``Timestamp`` object (:issue:`20464`) - Fixed bug where subtracting :class:`Timedelta` from an object-dtyped array would raise ``TypeError`` (:issue:`21980`) @@ -1868,7 +1868,7 @@ Reshaping - :func:`pandas.core.groupby.GroupBy.rank` now raises a ``ValueError`` when an invalid value is passed for argument ``na_option`` (:issue:`22124`) - Bug in :func:`get_dummies` with Unicode attributes in Python 2 (:issue:`22084`) - Bug in :meth:`DataFrame.replace` raises ``RecursionError`` when replacing empty lists (:issue:`22083`) -- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` when dict is used as the ``to_replace`` value and one key in the dict is another key's value, the results were inconsistent between using integer key and using string key (:issue:`20656`) +- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` when dict is used as the ``to_replace`` value and one key in the dict is is another key's value, the results were inconsistent between using integer key and using string key (:issue:`20656`) - Bug in :meth:`DataFrame.drop_duplicates` for empty ``DataFrame`` which incorrectly raises an error (:issue:`20516`) - Bug in :func:`pandas.wide_to_long` when a string is passed to the stubnames argument and a column name is a substring of that stubname (:issue:`22468`) - Bug in :func:`merge` when merging ``datetime64[ns, tz]`` data that contained a DST transition (:issue:`18885`) diff --git a/doc/source/whatsnew/v0.6.0.rst b/doc/source/whatsnew/v0.6.0.rst index 253ca4d4188e5..8ff688eaa91e7 100644 --- a/doc/source/whatsnew/v0.6.0.rst +++ b/doc/source/whatsnew/v0.6.0.rst @@ -15,7 +15,7 @@ New features ~~~~~~~~~~~~ - :ref:`Added ` ``melt`` function to ``pandas.core.reshape`` - :ref:`Added ` ``level`` parameter to group by level in Series and DataFrame descriptive statistics (:issue:`313`) -- :ref:`Added ` ``head`` and ``tail`` methods to Series, analogous to DataFrame (:issue:`296`) +- :ref:`Added ` ``head`` and ``tail`` methods to Series, analogous to to DataFrame (:issue:`296`) - :ref:`Added ` ``Series.isin`` function which checks if each value is contained in a passed sequence (:issue:`289`) - :ref:`Added ` ``float_format`` option to ``Series.to_string`` - :ref:`Added ` ``skip_footer`` (:issue:`291`) and ``converters`` (:issue:`343`) options to ``read_csv`` and ``read_table`` diff --git 
a/doc/source/whatsnew/v0.6.1.rst b/doc/source/whatsnew/v0.6.1.rst index 139c6e2d1cb0c..8ee80fa2c44b1 100644 --- a/doc/source/whatsnew/v0.6.1.rst +++ b/doc/source/whatsnew/v0.6.1.rst @@ -25,12 +25,12 @@ New features constructor (:issue:`444`) - DataFrame.convert_objects method for :ref:`inferring better dtypes ` for object columns (:issue:`302`) -- Add :ref:`rolling_corr_pairwise ` function for +- Add :ref:`rolling_corr_pairwise ` function for computing Panel of correlation matrices (:issue:`189`) - Add :ref:`margins ` option to :ref:`pivot_table ` for computing subgroup aggregates (:issue:`114`) - Add ``Series.from_csv`` function (:issue:`482`) -- :ref:`Can pass ` DataFrame/DataFrame and +- :ref:`Can pass ` DataFrame/DataFrame and DataFrame/Series to rolling_corr/rolling_cov (GH #462) - MultiIndex.get_level_values can :ref:`accept the level name ` diff --git a/doc/source/whatsnew/v0.8.0.rst b/doc/source/whatsnew/v0.8.0.rst index 781054fc4de7c..b34c2a5c6a07c 100644 --- a/doc/source/whatsnew/v0.8.0.rst +++ b/doc/source/whatsnew/v0.8.0.rst @@ -81,7 +81,7 @@ Time Series changes and improvements timestamps are stored as UTC; Timestamps from DatetimeIndex objects with time zone set will be localized to local time. Time zone conversions are therefore essentially free. User needs to know very little about pytz library now; only - time zone names as strings are required. Time zone-aware timestamps are + time zone names as strings are required. Time zone-aware timestamps are equal if and only if their UTC timestamps match. Operations between time zone-aware time series with different time zones will result in a UTC-indexed time series. diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 6512e4cce02a9..8f9ceb30a947a 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -46,7 +46,7 @@ We've added an ``engine`` keyword to :meth:`~core.window.rolling.Rolling.apply` that allows the user to execute the routine using `Numba `__ instead of Cython. Using the Numba engine can yield significant performance gains if the apply function can operate on numpy arrays and the data set is larger (1 million rows or greater). For more details, see -:ref:`rolling apply documentation ` (:issue:`28987`, :issue:`30936`) +:ref:`rolling apply documentation ` (:issue:`28987`, :issue:`30936`) .. _whatsnew_100.custom_window: @@ -57,7 +57,7 @@ We've added a :func:`pandas.api.indexers.BaseIndexer` class that allows users to window bounds are created during ``rolling`` operations. Users can define their own ``get_window_bounds`` method on a :func:`pandas.api.indexers.BaseIndexer` subclass that will generate the start and end indices used for each window during the rolling aggregation. For more details and example usage, see -the :ref:`custom window rolling documentation ` +the :ref:`custom window rolling documentation ` .. _whatsnew_100.to_markdown: diff --git a/doc/source/whatsnew/v1.1.5.rst b/doc/source/whatsnew/v1.1.5.rst index 46c4ad4f35fe4..a29ae1912e338 100644 --- a/doc/source/whatsnew/v1.1.5.rst +++ b/doc/source/whatsnew/v1.1.5.rst @@ -14,15 +14,8 @@
Fixed regressions ~~~~~~~~~~~~~~~~~ -- Fixed regression in addition of a timedelta-like scalar to a :class:`DatetimeIndex` raising incorrectly (:issue:`37295`) -- Fixed regression in :meth:`Series.groupby` raising when the :class:`Index` of the :class:`Series` had a tuple as its name (:issue:`37755`) -- Fixed regression in :meth:`DataFrame.loc` and :meth:`Series.loc` for ``__setitem__`` when one-dimensional tuple was given to select from :class:`MultiIndex` (:issue:`37711`) -- Fixed regression in inplace operations on :class:`Series` with ``ExtensionDtype`` with NumPy dtyped operand (:issue:`37910`) -- Fixed regression in metadata propagation for ``groupby`` iterator (:issue:`37343`) -- Fixed regression in indexing on a :class:`Series` with ``CategoricalDtype`` after unpickling (:issue:`37631`) -- Fixed regression in :meth:`DataFrame.groupby` aggregation with out-of-bounds datetime objects in an object-dtype column (:issue:`36003`) -- Fixed regression in ``df.groupby(..).rolling(..)`` with the resulting :class:`MultiIndex` when grouping by a label that is in the index (:issue:`37641`) -- Fixed regression in :meth:`DataFrame.fillna` not filling ``NaN`` after other operations such as :meth:`DataFrame.pivot` (:issue:`36495`). +- +- .. --------------------------------------------------------------------------- @@ -30,15 +23,10 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- Bug in pytables methods in python 3.9 (:issue:`38041`) - -.. --------------------------------------------------------------------------- - -.. _whatsnew_115.other: - -Other -~~~~~ -- Only set ``-Werror`` as a compiler flag in the CI jobs (:issue:`33315`, :issue:`33314`) +- Bug in metadata propagation for ``groupby`` iterator (:issue:`37343`) +- Bug in indexing on a :class:`Series` with ``CategoricalDtype`` after unpickling (:issue:`37631`) +- Bug in :class:`RollingGroupby` with the resulting :class:`MultiIndex` when grouping by a label that is in the index (:issue:`37641`) +- .. --------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index ac8132339d38c..d1899e1d72509 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -22,7 +22,7 @@ Optionally disallow duplicate labels control whether the index or columns can contain duplicate labels (:issue:`28394`). This can be used to prevent accidental introduction of duplicate labels, which can affect downstream operations. -By default, duplicates continue to be allowed. +By default, duplicates continue to be allowed .. ipython:: python @@ -84,8 +84,7 @@ Support for binary file handles in ``to_csv`` :meth:`to_csv` supports file handles in binary mode (:issue:`19827` and :issue:`35058`) with ``encoding`` (:issue:`13068` and :issue:`23854`) and ``compression`` (:issue:`22555`). -If pandas does not automatically detect whether the file handle is opened in binary or text mode, -it is necessary to provide ``mode="wb"``. +``mode`` has to contain a ``b`` for binary handles to be supported. 
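A minimal sketch of the binary-handle usage described above (assuming pandas >= 1.2; the output path and frame are illustrative, not taken from the patch):

.. code-block:: python

   import pandas as pd

   df = pd.DataFrame({"a": [1, 2, 3]})

   # The handle is opened in binary mode, so the ``mode`` passed to ``to_csv``
   # must contain "b"; the requested gzip compression is applied to the
   # already-open handle.
   with open("out.csv.gz", "wb") as handle:  # illustrative path
       df.to_csv(handle, mode="wb", compression="gzip")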
For example: @@ -95,7 +94,7 @@ For example: data = pd.DataFrame([0, 1, 2]) buffer = io.BytesIO() - data.to_csv(buffer, encoding="utf-8", compression="gzip") + data.to_csv(buffer, mode="w+b", encoding="utf-8", compression="gzip") Support for short caption and table position in ``to_latex`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -104,7 +103,7 @@ Support for short caption and table position in ``to_latex`` a floating table position (:issue:`35281`) and a short caption (:issue:`36267`). -The keyword ``position`` has been added to set the position. +New keyword ``position`` is implemented to set the position. .. ipython:: python @@ -112,9 +111,9 @@ The keyword ``position`` has been added to set the position. table = data.to_latex(position='ht') print(table) -Usage of the keyword ``caption`` has been extended. +Usage of keyword ``caption`` is extended. Besides taking a single string as an argument, -one can optionally provide a tuple ``(full_caption, short_caption)`` +one can optionally provide a tuple of ``(full_caption, short_caption)`` to add a short caption macro. .. ipython:: python @@ -141,12 +140,12 @@ parser by default should have no impact on performance. (:issue:`17154`) Experimental nullable data types for float data ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -We've added :class:`Float32Dtype` / :class:`Float64Dtype` and :class:`~arrays.FloatingArray`. -These are extension data types dedicated to floating point data that can hold the +We've added :class:`Float32Dtype` / :class:`Float64Dtype` and :class:`~arrays.FloatingArray`, +an extension data type dedicated to floating point data that can hold the ``pd.NA`` missing value indicator (:issue:`32265`, :issue:`34307`). While the default float data type already supports missing values using ``np.nan``, -these new data types use ``pd.NA`` (and its corresponding behaviour) as the missing +this new data type uses ``pd.NA`` (and its corresponding behaviour) as missing value indicator, in line with the already existing nullable :ref:`integer ` and :ref:`boolean ` data types. @@ -180,7 +179,7 @@ Alternatively, you can also use the dtype object: .. warning:: - Experimental: the new floating data types are currently experimental, and their + Experimental: the new floating data types are currently experimental, and its behaviour or API may still change without warning. Especially the behaviour regarding NaN (distinct from NA missing values) is subject to change. @@ -189,8 +188,8 @@ Alternatively, you can also use the dtype object: Index/column name preservation when aggregating ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -When aggregating using :meth:`concat` or the :class:`DataFrame` constructor, pandas -will now attempt to preserve index and column names whenever possible (:issue:`35847`). +When aggregating using :meth:`concat` or the :class:`DataFrame` constructor, Pandas +will attempt to preserve index (and column) names whenever possible (:issue:`35847`). In the case where all inputs share a common name, this name will be assigned to the result. When the input names do not all agree, the result will be unnamed. Here is an example where the index name is preserved: @@ -248,164 +247,37 @@ By default, backward resample uses ``closed=right`` while ``closed=left`` is also supported.
 ts.resample("17min", closed="left", origin="end").sum()
 -..
_whatsnew_120.groupby_ewm: - -Groupby supports EWM operations directly -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -:class:`.DataFrameGroupBy` now supports exponentially weighted window operations directly (:issue:`16037`). - -.. ipython:: python - - df = pd.DataFrame({'A': ['a', 'b', 'a', 'b'], 'B': range(4)}) - df - df.groupby('A').ewm(com=1.0).mean() - -Additionally ``mean`` supports execution via `Numba `__ with -the ``engine`` and ``engine_kwargs`` arguments. Numba must be installed as an optional dependency -to use this feature. - .. _whatsnew_120.enhancements.other: Other enhancements ^^^^^^^^^^^^^^^^^^ -- Added ``day_of_week`` (compatibility alias ``dayofweek``) property to :class:`Timestamp`, :class:`.DatetimeIndex`, :class:`Period`, :class:`PeriodIndex` (:issue:`9605`) -- Added ``day_of_year`` (compatibility alias ``dayofyear``) property to :class:`Timestamp`, :class:`.DatetimeIndex`, :class:`Period`, :class:`PeriodIndex` (:issue:`9605`) -- Added :meth:`~DataFrame.set_flags` for setting table-wide flags on a Series or DataFrame (:issue:`28394`) +- Added ``day_of_week``(compatibility alias ``dayofweek``) property to ``Timestamp``, ``DatetimeIndex``, ``Period``, ``PeriodIndex`` (:issue:`9605`) +- Added ``day_of_year`` (compatibility alias ``dayofyear``) property to ``Timestamp``, ``DatetimeIndex``, ``Period``, ``PeriodIndex`` (:issue:`9605`) +- Added :meth:`~DataFrame.set_flags` for setting table-wide flags on a ``Series`` or ``DataFrame`` (:issue:`28394`) - :meth:`DataFrame.applymap` now supports ``na_action`` (:issue:`23803`) - :class:`Index` with object dtype supports division and multiplication (:issue:`34160`) - :meth:`DataFrame.explode` and :meth:`Series.explode` now support exploding of sets (:issue:`35614`) - :meth:`DataFrame.hist` now supports time series (datetime) data (:issue:`32590`) -- :meth:`.Styler.set_table_styles` now allows the direct styling of rows and columns and can be chained (:issue:`35607`) -- :class:`.Styler` now allows direct CSS class name addition to individual data cells (:issue:`36159`) -- :meth:`.Rolling.mean` and :meth:`.Rolling.sum` use Kahan summation to calculate the mean to avoid numerical problems (:issue:`10319`, :issue:`11645`, :issue:`13254`, :issue:`32761`, :issue:`36031`) -- :meth:`.DatetimeIndex.searchsorted`, :meth:`.TimedeltaIndex.searchsorted`, :meth:`PeriodIndex.searchsorted`, and :meth:`Series.searchsorted` with datetimelike dtypes will now try to cast string arguments (listlike and scalar) to the matching datetimelike type (:issue:`36346`) +- ``Styler`` now allows direct CSS class name addition to individual data cells (:issue:`36159`) +- :meth:`Rolling.mean()` and :meth:`Rolling.sum()` use Kahan summation to calculate the mean to avoid numerical problems (:issue:`10319`, :issue:`11645`, :issue:`13254`, :issue:`32761`, :issue:`36031`) +- :meth:`DatetimeIndex.searchsorted`, :meth:`TimedeltaIndex.searchsorted`, :meth:`PeriodIndex.searchsorted`, and :meth:`Series.searchsorted` with datetimelike dtypes will now try to cast string arguments (listlike and scalar) to the matching datetimelike type (:issue:`36346`) +- - Added methods :meth:`IntegerArray.prod`, :meth:`IntegerArray.min`, and :meth:`IntegerArray.max` (:issue:`33790`) -- Calling a NumPy ufunc on a ``DataFrame`` with extension types now preserves the extension types when possible (:issue:`23743`). -- Calling a binary-input NumPy ufunc on multiple ``DataFrame`` objects now aligns, matching the behavior of binary operations and ufuncs on ``Series`` (:issue:`23743`). 
- Where possible :meth:`RangeIndex.difference` and :meth:`RangeIndex.symmetric_difference` will return :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`36564`) -- :meth:`DataFrame.to_parquet` now supports :class:`MultiIndex` for columns in parquet format (:issue:`34777`) -- Added :meth:`.Rolling.sem` and :meth:`Expanding.sem` to compute the standard error of the mean (:issue:`26476`) -- :meth:`.Rolling.var` and :meth:`.Rolling.std` use Kahan summation and Welford's Method to avoid numerical issues (:issue:`37051`) -- :meth:`DataFrame.corr` and :meth:`DataFrame.cov` use Welford's Method to avoid numerical issues (:issue:`37448`) +- Added :meth:`Rolling.sem()` and :meth:`Expanding.sem()` to compute the standard error of mean (:issue:`26476`). +- :meth:`Rolling.var()` and :meth:`Rolling.std()` use Kahan summation and Welfords Method to avoid numerical issues (:issue:`37051`) +- :meth:`DataFrame.corr` and :meth:`DataFrame.cov` use Welfords Method to avoid numerical issues (:issue:`37448`) - :meth:`DataFrame.plot` now recognizes ``xlabel`` and ``ylabel`` arguments for plots of type ``scatter`` and ``hexbin`` (:issue:`37001`) -- :class:`DataFrame` now supports the ``divmod`` operation (:issue:`37165`) +- :class:`DataFrame` now supports ``divmod`` operation (:issue:`37165`) - :meth:`DataFrame.to_parquet` now returns a ``bytes`` object when no ``path`` argument is passed (:issue:`37105`) -- :class:`.Rolling` now supports the ``closed`` argument for fixed windows (:issue:`34315`) -- :class:`.DatetimeIndex` and :class:`Series` with ``datetime64`` or ``datetime64tz`` dtypes now support ``std`` (:issue:`37436`) +- :class:`Rolling` now supports the ``closed`` argument for fixed windows (:issue:`34315`) +- :class:`DatetimeIndex` and :class:`Series` with ``datetime64`` or ``datetime64tz`` dtypes now support ``std`` (:issue:`37436`) - :class:`Window` now supports all Scipy window types in ``win_type`` with flexible keyword argument support (:issue:`34556`) - :meth:`testing.assert_index_equal` now has a ``check_order`` parameter that allows indexes to be checked in an order-insensitive manner (:issue:`37478`) - :func:`read_csv` supports memory-mapping for compressed files (:issue:`37621`) -- Add support for ``min_count`` keyword for :meth:`DataFrame.groupby` and :meth:`DataFrame.resample` for functions ``min``, ``max``, ``first`` and ``last`` (:issue:`37821`, :issue:`37768`) -- Improve error reporting for :meth:`DataFrame.merge` when invalid merge column definitions were given (:issue:`16228`) -- Improve numerical stability for :meth:`.Rolling.skew`, :meth:`.Rolling.kurt`, :meth:`Expanding.skew` and :meth:`Expanding.kurt` through implementation of Kahan summation (:issue:`6929`) -- Improved error reporting for subsetting columns of a :class:`.DataFrameGroupBy` with ``axis=1`` (:issue:`37725`) -- Implement method ``cross`` for :meth:`DataFrame.merge` and :meth:`DataFrame.join` (:issue:`5401`) - -.. --------------------------------------------------------------------------- - -.. _whatsnew_120.notable_bug_fixes: - -Notable bug fixes -~~~~~~~~~~~~~~~~~ - -These are bug fixes that might have notable behavior changes. - -Consistency of DataFrame Reductions -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:meth:`DataFrame.any` and :meth:`DataFrame.all` with ``bool_only=True`` now -determines whether to exclude object-dtype columns on a column-by-column basis, -instead of checking if *all* object-dtype columns can be considered boolean. 
- -This prevents pathological behavior where applying the reduction on a subset -of columns could result in a larger Series result. See (:issue:`37799`). - -.. ipython:: python - - df = pd.DataFrame({"A": ["foo", "bar"], "B": [True, False]}, dtype=object) - df["C"] = pd.Series([True, True]) - - -*Previous behavior*: - -.. code-block:: ipython - - In [5]: df.all(bool_only=True) - Out[5]: - C True - dtype: bool - - In [6]: df[["B", "C"]].all(bool_only=True) - Out[6]: - B False - C True - dtype: bool - -*New behavior*: - -.. ipython:: python - - In [5]: df.all(bool_only=True) - - In [6]: df[["B", "C"]].all(bool_only=True) - - -Other DataFrame reductions with ``numeric_only=None`` will also avoid -this pathological behavior (:issue:`37827`): - -.. ipython:: python - - df = pd.DataFrame({"A": [0, 1, 2], "B": ["a", "b", "c"]}, dtype=object) - - -*Previous behavior*: - -.. code-block:: ipython - - In [3]: df.mean() - Out[3]: Series([], dtype: float64) - - In [4]: df[["A"]].mean() - Out[4]: - A 1.0 - dtype: float64 - -*New behavior*: - -.. ipython:: python - - df.mean() - - df[["A"]].mean() - -Moreover, DataFrame reductions with ``numeric_only=None`` will now be -consistent with their Series counterparts. In particular, for -reductions where the Series method raises ``TypeError``, the -DataFrame reduction will now consider that column non-numeric -instead of casting to a NumPy array which may have different semantics (:issue:`36076`, -:issue:`28949`, :issue:`21020`). - -.. ipython:: python - - ser = pd.Series([0, 1], dtype="category", name="A") - df = ser.to_frame() - - -*Previous behavior*: - -.. code-block:: ipython - - In [5]: df.any() - Out[5]: - A True - dtype: bool - -*New behavior*: - -.. ipython:: python - - df.any() - +- Improve error reporting for :meth:`DataFrame.merge()` when invalid merge column definitions were given (:issue:`16228`) +- Improve numerical stability for :meth:`Rolling.skew()`, :meth:`Rolling.kurt()`, :meth:`Expanding.skew()` and :meth:`Expanding.kurt()` through implementation of Kahan summation (:issue:`6929`) +- Improved error reporting for subsetting columns of a :class:`DataFrameGroupBy` with ``axis=1`` (:issue:`37725`) .. _whatsnew_120.api_breaking.python: @@ -493,11 +365,7 @@ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for mor Other API changes ^^^^^^^^^^^^^^^^^ -- Sorting in descending order is now stable for :meth:`Series.sort_values` and :meth:`Index.sort_values` for DateTime-like :class:`Index` subclasses. This will affect sort order when sorting a DataFrame on multiple columns, sorting with a key function that produces duplicates, or requesting the sorting index when using :meth:`Index.sort_values`. When using :meth:`Series.value_counts`, the count of missing values is no longer necessarily last in the list of duplicate counts. Instead, its position corresponds to the position in the original Series. When using :meth:`Index.sort_values` for DateTime-like :class:`Index` subclasses, NaTs ignored the ``na_position`` argument and were sorted to the beginning. Now they respect ``na_position``, the default being ``last``, same as other :class:`Index` subclasses. 
(:issue:`35992`) -- Passing an invalid ``fill_value`` to :meth:`Categorical.take`, :meth:`.DatetimeArray.take`, :meth:`TimedeltaArray.take`, or :meth:`PeriodArray.take` now raises a ``TypeError`` instead of a ``ValueError`` (:issue:`37733`) -- Passing an invalid ``fill_value`` to :meth:`Series.shift` with a ``CategoricalDtype`` now raises a ``TypeError`` instead of a ``ValueError`` (:issue:`37733`) -- Passing an invalid value to :meth:`IntervalIndex.insert` or :meth:`CategoricalIndex.insert` now raises a ``TypeError`` instead of a ``ValueError`` (:issue:`37733`) -- Attempting to reindex a Series with a :class:`CategoricalIndex` with an invalid ``fill_value`` now raises a ``TypeError`` instead of a ``ValueError`` (:issue:`37733`) +- Sorting in descending order is now stable for :meth:`Series.sort_values` and :meth:`Index.sort_values` for DateTime-like :class:`Index` subclasses. This will affect sort order when sorting :class:`DataFrame` on multiple columns, sorting with a key function that produces duplicates, or requesting the sorting index when using :meth:`Index.sort_values`. When using :meth:`Series.value_counts`, count of missing values is no longer the last in the list of duplicate counts, and its position corresponds to the position in the original :class:`Series`. When using :meth:`Index.sort_values` for DateTime-like :class:`Index` subclasses, NaTs ignored the ``na_position`` argument and were sorted to the beggining. Now they respect ``na_position``, the default being ``last``, same as other :class:`Index` subclasses. (:issue:`35992`) .. --------------------------------------------------------------------------- @@ -506,31 +374,24 @@ Other API changes Deprecations ~~~~~~~~~~~~ - Deprecated parameter ``inplace`` in :meth:`MultiIndex.set_codes` and :meth:`MultiIndex.set_levels` (:issue:`35626`) -- Deprecated parameter ``dtype`` of method :meth:`~Index.copy` for all :class:`Index` subclasses. Use the :meth:`~Index.astype` method instead for changing dtype (:issue:`35853`) -- Deprecated parameters ``levels`` and ``codes`` in :meth:`MultiIndex.copy`. Use the :meth:`~MultiIndex.set_levels` and :meth:`~MultiIndex.set_codes` methods instead (:issue:`36685`) +- Deprecated parameter ``dtype`` in :meth:`~Index.copy` on method all index classes. Use the :meth:`~Index.astype` method instead for changing dtype (:issue:`35853`) +- Deprecated parameters ``levels`` and ``codes`` in :meth:`~MultiIndex.copy`. Use the :meth:`~MultiIndex.set_levels` and :meth:`~MultiIndex.set_codes` methods instead (:issue:`36685`) - Date parser functions :func:`~pandas.io.date_converters.parse_date_time`, :func:`~pandas.io.date_converters.parse_date_fields`, :func:`~pandas.io.date_converters.parse_all_fields` and :func:`~pandas.io.date_converters.generic_parser` from ``pandas.io.date_converters`` are deprecated and will be removed in a future version; use :func:`to_datetime` instead (:issue:`35741`) - :meth:`DataFrame.lookup` is deprecated and will be removed in a future version, use :meth:`DataFrame.melt` and :meth:`DataFrame.loc` instead (:issue:`18682`) -- The method :meth:`Index.to_native_types` is deprecated. 
Use ``.astype(str)`` instead (:issue:`28867`) -- Deprecated indexing :class:`DataFrame` rows with a single datetime-like string as ``df[string]`` - (given the ambiguity whether it is indexing the rows or selecting a column), use - ``df.loc[string]`` instead (:issue:`36179`) -- Deprecated casting an object-dtype index of ``datetime`` objects to :class:`.DatetimeIndex` in the :class:`Series` constructor (:issue:`23598`) +- The :meth:`Index.to_native_types` is deprecated. Use ``.astype(str)`` instead (:issue:`28867`) +- Deprecated indexing :class:`DataFrame` rows with datetime-like strings ``df[string]``, use ``df.loc[string]`` instead (:issue:`36179`) +- Deprecated casting an object-dtype index of ``datetime`` objects to :class:`DatetimeIndex` in the :class:`Series` constructor (:issue:`23598`) - Deprecated :meth:`Index.is_all_dates` (:issue:`27744`) - The default value of ``regex`` for :meth:`Series.str.replace` will change from ``True`` to ``False`` in a future release. In addition, single character regular expressions will *not* be treated as literal strings when ``regex=True`` is set. (:issue:`24804`) - Deprecated automatic alignment on comparison operations between :class:`DataFrame` and :class:`Series`, do ``frame, ser = frame.align(ser, axis=1, copy=False)`` before e.g. ``frame == ser`` (:issue:`28759`) - :meth:`Rolling.count` with ``min_periods=None`` will default to the size of the window in a future version (:issue:`31302`) -- Using "outer" ufuncs on DataFrames to return 4d ndarray is now deprecated. Convert to an ndarray first (:issue:`23743`) - Deprecated slice-indexing on timezone-aware :class:`DatetimeIndex` with naive ``datetime`` objects, to match scalar indexing behavior (:issue:`36148`) - :meth:`Index.ravel` returning a ``np.ndarray`` is deprecated, in the future this will return a view on the same index (:issue:`19956`) - Deprecate use of strings denoting units with 'M', 'Y' or 'y' in :func:`~pandas.to_timedelta` (:issue:`36666`) - :class:`Index` methods ``&``, ``|``, and ``^`` behaving as the set operations :meth:`Index.intersection`, :meth:`Index.union`, and :meth:`Index.symmetric_difference`, respectively, are deprecated and in the future will behave as pointwise boolean operations matching :class:`Series` behavior. Use the named set methods instead (:issue:`36758`) - :meth:`Categorical.is_dtype_equal` and :meth:`CategoricalIndex.is_dtype_equal` are deprecated, will be removed in a future version (:issue:`37545`) - :meth:`Series.slice_shift` and :meth:`DataFrame.slice_shift` are deprecated, use :meth:`Series.shift` or :meth:`DataFrame.shift` instead (:issue:`37601`) -- Partial slicing on unordered :class:`.DatetimeIndex` objects with keys that are not in the index is deprecated and will be removed in a future version (:issue:`18531`) -- The ``how`` keyword in :meth:`PeriodIndex.astype` is deprecated and will be removed in a future version, use ``index.to_timestamp(how=how)`` instead (:issue:`37982`) -- Deprecated :meth:`Index.asi8` for :class:`Index` subclasses other than :class:`.DatetimeIndex`, :class:`.TimedeltaIndex`, and :class:`PeriodIndex` (:issue:`37877`) -- The ``inplace`` parameter of :meth:`Categorical.remove_unused_categories` is deprecated and will be removed in a future version (:issue:`37643`) -- The ``null_counts`` parameter of :meth:`DataFrame.info` is deprecated and replaced by ``show_counts``. It will be removed in a future version (:issue:`37999`) + .. 
--------------------------------------------------------------------------- @@ -541,22 +402,19 @@ Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Performance improvements when creating DataFrame or Series with dtype ``str`` or :class:`StringDtype` from array with many string elements (:issue:`36304`, :issue:`36317`, :issue:`36325`, :issue:`36432`, :issue:`37371`) -- Performance improvement in :meth:`.GroupBy.agg` with the ``numba`` engine (:issue:`35759`) -- Performance improvements when creating :meth:`Series.map` from a huge dictionary (:issue:`34717`) -- Performance improvement in :meth:`.GroupBy.transform` with the ``numba`` engine (:issue:`36240`) -- :class:`.Styler` uuid method altered to compress data transmission over web whilst maintaining reasonably low table collision probability (:issue:`36345`) -- Performance improvement in :func:`to_datetime` with non-ns time unit for ``float`` ``dtype`` columns (:issue:`20445`) -- Performance improvement in setting values on an :class:`IntervalArray` (:issue:`36310`) +- Performance improvement in :meth:`GroupBy.agg` with the ``numba`` engine (:issue:`35759`) +- Performance improvements when creating :meth:`pd.Series.map` from a huge dictionary (:issue:`34717`) +- Performance improvement in :meth:`GroupBy.transform` with the ``numba`` engine (:issue:`36240`) +- ``Styler`` uuid method altered to compress data transmission over web whilst maintaining reasonably low table collision probability (:issue:`36345`) +- Performance improvement in :meth:`pd.to_datetime` with non-ns time unit for ``float`` ``dtype`` columns (:issue:`20445`) +- Performance improvement in setting values on a :class:`IntervalArray` (:issue:`36310`) - The internal index method :meth:`~Index._shallow_copy` now makes the new index and original index share cached attributes, avoiding creating these again, if created on either. This can speed up operations that depend on creating copies of existing indexes (:issue:`36840`) -- Performance improvement in :meth:`.RollingGroupby.count` (:issue:`35625`) -- Small performance decrease to :meth:`.Rolling.min` and :meth:`.Rolling.max` for fixed windows (:issue:`36567`) +- Performance improvement in :meth:`RollingGroupby.count` (:issue:`35625`) +- Small performance decrease to :meth:`Rolling.min` and :meth:`Rolling.max` for fixed windows (:issue:`36567`) - Reduced peak memory usage in :meth:`DataFrame.to_pickle` when using ``protocol=5`` in python 3.8+ (:issue:`34244`) -- Faster ``dir`` calls when the object has many index labels, e.g. ``dir(ser)`` (:issue:`37450`) +- faster ``dir`` calls when many index labels, e.g. ``dir(ser)`` (:issue:`37450`) - Performance improvement in :class:`ExpandingGroupby` (:issue:`37064`) -- Performance improvement in :meth:`Series.astype` and :meth:`DataFrame.astype` for :class:`Categorical` (:issue:`8628`) -- Performance improvement in :meth:`DataFrame.groupby` for ``float`` ``dtype`` (:issue:`28303`), changes of the underlying hash-function can lead to changes in float based indexes sort ordering for ties (e.g. :meth:`Index.value_counts`) -- Performance improvement in :meth:`pd.isin` for inputs with more than 1e6 elements (:issue:`36611`) .. 
--------------------------------------------------------------------------- @@ -567,41 +425,35 @@ Bug fixes Categorical ^^^^^^^^^^^ -- :meth:`Categorical.fillna` will always return a copy, validate a passed fill value regardless of whether there are any NAs to fill, and disallow an ``NaT`` as a fill value for numeric categories (:issue:`36530`) +- :meth:`Categorical.fillna` will always return a copy, will validate a passed fill value regardless of whether there are any NAs to fill, and will disallow a ``NaT`` as a fill value for numeric categories (:issue:`36530`) - Bug in :meth:`Categorical.__setitem__` that incorrectly raised when trying to set a tuple value (:issue:`20439`) - Bug in :meth:`CategoricalIndex.equals` incorrectly casting non-category entries to ``np.nan`` (:issue:`37667`) -- Bug in :meth:`CategoricalIndex.where` incorrectly setting non-category entries to ``np.nan`` instead of raising ``TypeError`` (:issue:`37977`) -- Datetimelike ^^^^^^^^^^^^ -- Bug in :attr:`.DatetimeArray.date` where a ``ValueError`` would be raised with a read-only backing array (:issue:`33530`) +- Bug in :attr:`DatetimeArray.date` where a ``ValueError`` would be raised with a read-only backing array (:issue:`33530`) - Bug in ``NaT`` comparisons failing to raise ``TypeError`` on invalid inequality comparisons (:issue:`35046`) -- Bug in :class:`.DateOffset` where attributes reconstructed from pickle files differ from original objects when input values exceed normal ranges (e.g months=12) (:issue:`34511`) -- Bug in :meth:`.DatetimeIndex.get_slice_bound` where ``datetime.date`` objects were not accepted or naive :class:`Timestamp` with a tz-aware :class:`.DatetimeIndex` (:issue:`35690`) -- Bug in :meth:`.DatetimeIndex.slice_locs` where ``datetime.date`` objects were not accepted (:issue:`34077`) -- Bug in :meth:`.DatetimeIndex.searchsorted`, :meth:`.TimedeltaIndex.searchsorted`, :meth:`PeriodIndex.searchsorted`, and :meth:`Series.searchsorted` with ``datetime64``, ``timedelta64`` or :class:`Period` dtype placement of ``NaT`` values being inconsistent with NumPy (:issue:`36176`, :issue:`36254`) -- Inconsistency in :class:`.DatetimeArray`, :class:`.TimedeltaArray`, and :class:`.PeriodArray` method ``__setitem__`` casting arrays of strings to datetimelike scalars but not scalar strings (:issue:`36261`) -- Bug in :meth:`.DatetimeArray.take` incorrectly allowing ``fill_value`` with a mismatched timezone (:issue:`37356`) -- Bug in :class:`.DatetimeIndex.shift` incorrectly raising when shifting empty indexes (:issue:`14811`) -- :class:`Timestamp` and :class:`.DatetimeIndex` comparisons between timezone-aware and timezone-naive objects now follow the standard library ``datetime`` behavior, returning ``True``/``False`` for ``!=``/``==`` and raising for inequality comparisons (:issue:`28507`) -- Bug in :meth:`.DatetimeIndex.equals` and :meth:`.TimedeltaIndex.equals` incorrectly considering ``int64`` indexes as equal (:issue:`36744`) -- :meth:`Series.to_json`, :meth:`DataFrame.to_json`, and :meth:`read_json` now implement timezone parsing when orient structure is ``table`` (:issue:`35973`) -- :meth:`astype` now attempts to convert to ``datetime64[ns, tz]`` directly from ``object`` with inferred timezone from string (:issue:`35973`) -- Bug in :meth:`.TimedeltaIndex.sum` and :meth:`Series.sum` with ``timedelta64`` dtype on an empty index or series returning ``NaT`` instead of ``Timedelta(0)`` (:issue:`31751`) -- Bug in :meth:`.DatetimeArray.shift` incorrectly allowing ``fill_value`` with a mismatched timezone 
(:issue:`37299`) -- Bug in adding a :class:`.BusinessDay` with nonzero ``offset`` to a non-scalar other (:issue:`37457`) +- Bug in :class:`DateOffset` where attributes reconstructed from pickle files differ from original objects when input values exceed normal ranges (e.g months=12) (:issue:`34511`) +- Bug in :meth:`DatetimeIndex.get_slice_bound` where ``datetime.date`` objects were not accepted or naive :class:`Timestamp` with a tz-aware :class:`DatetimeIndex` (:issue:`35690`) +- Bug in :meth:`DatetimeIndex.slice_locs` where ``datetime.date`` objects were not accepted (:issue:`34077`) +- Bug in :meth:`DatetimeIndex.searchsorted`, :meth:`TimedeltaIndex.searchsorted`, :meth:`PeriodIndex.searchsorted`, and :meth:`Series.searchsorted` with ``datetime64``, ``timedelta64`` or ``Period`` dtype placement of ``NaT`` values being inconsistent with ``NumPy`` (:issue:`36176`, :issue:`36254`) +- Inconsistency in :class:`DatetimeArray`, :class:`TimedeltaArray`, and :class:`PeriodArray` setitem casting arrays of strings to datetimelike scalars but not scalar strings (:issue:`36261`) +- Bug in :meth:`DatetimeArray.take` incorrectly allowing ``fill_value`` with a mismatched timezone (:issue:`37356`) +- Bug in :class:`DatetimeIndex.shift` incorrectly raising when shifting empty indexes (:issue:`14811`) +- :class:`Timestamp` and :class:`DatetimeIndex` comparisons between timezone-aware and timezone-naive objects now follow the standard library ``datetime`` behavior, returning ``True``/``False`` for ``!=``/``==`` and raising for inequality comparisons (:issue:`28507`) +- Bug in :meth:`DatetimeIndex.equals` and :meth:`TimedeltaIndex.equals` incorrectly considering ``int64`` indexes as equal (:issue:`36744`) +- :meth:`to_json` and :meth:`read_json` now implements timezones parsing when orient structure is 'table'. +- :meth:`astype` now attempts to convert to 'datetime64[ns, tz]' directly from 'object' with inferred timezone from string (:issue:`35973`). 
+- Bug in :meth:`TimedeltaIndex.sum` and :meth:`Series.sum` with ``timedelta64`` dtype on an empty index or series returning ``NaT`` instead of ``Timedelta(0)`` (:issue:`31751`) +- Bug in :meth:`DatetimeArray.shift` incorrectly allowing ``fill_value`` with a mismatched timezone (:issue:`37299`) +- Bug in adding a :class:`BusinessDay` with nonzero ``offset`` to a non-scalar other (:issue:`37457`) - Bug in :func:`to_datetime` with a read-only array incorrectly raising (:issue:`34857`) -- Bug in :meth:`Series.isin` with ``datetime64[ns]`` dtype and :meth:`.DatetimeIndex.isin` incorrectly casting integers to datetimes (:issue:`36621`) -- Bug in :meth:`Series.isin` with ``datetime64[ns]`` dtype and :meth:`.DatetimeIndex.isin` failing to consider timezone-aware and timezone-naive datetimes as always different (:issue:`35728`) -- Bug in :meth:`Series.isin` with ``PeriodDtype`` dtype and :meth:`PeriodIndex.isin` failing to consider arguments with different ``PeriodDtype`` as always different (:issue:`37528`) Timedelta ^^^^^^^^^ -- Bug in :class:`.TimedeltaIndex`, :class:`Series`, and :class:`DataFrame` floor-division with ``timedelta64`` dtypes and ``NaT`` in the denominator (:issue:`35529`) -- Bug in parsing of ISO 8601 durations in :class:`Timedelta` and :func:`to_datetime` (:issue:`29773`, :issue:`36204`) +- Bug in :class:`TimedeltaIndex`, :class:`Series`, and :class:`DataFrame` floor-division with ``timedelta64`` dtypes and ``NaT`` in the denominator (:issue:`35529`) +- Bug in parsing of ISO 8601 durations in :class:`Timedelta`, :meth:`pd.to_datetime` (:issue:`37159`, fixes :issue:`29773` and :issue:`36204`) - Bug in :func:`to_timedelta` with a read-only array incorrectly raising (:issue:`34857`) -- Bug in :class:`Timedelta` incorrectly truncating to sub-second portion of a string input when it has precision higher than nanoseconds (:issue:`36738`) Timezones ^^^^^^^^^ @@ -615,17 +467,17 @@ Numeric - Bug in :func:`to_numeric` where float precision was incorrect (:issue:`31364`) - Bug in :meth:`DataFrame.any` with ``axis=1`` and ``bool_only=True`` ignoring the ``bool_only`` keyword (:issue:`32432`) - Bug in :meth:`Series.equals` where a ``ValueError`` was raised when numpy arrays were compared to scalars (:issue:`35267`) -- Bug in :class:`Series` where two Series each have a :class:`.DatetimeIndex` with different timezones having those indexes incorrectly changed when performing arithmetic operations (:issue:`33671`) -- Bug in :mod:`pandas.testing` module functions when used with ``check_exact=False`` on complex numeric types (:issue:`28235`) +- Bug in :class:`Series` where two :class:`Series` each have a :class:`DatetimeIndex` with different timezones having those indexes incorrectly changed when performing arithmetic operations (:issue:`33671`) +- Bug in :meth:`pd._testing.assert_almost_equal` was incorrect for complex numeric types (:issue:`28235`) - Bug in :meth:`DataFrame.__rmatmul__` error handling reporting transposed shapes (:issue:`21581`) - Bug in :class:`Series` flex arithmetic methods where the result when operating with a ``list``, ``tuple`` or ``np.ndarray`` would have an incorrect name (:issue:`36760`) -- Bug in :class:`.IntegerArray` multiplication with ``timedelta`` and ``np.timedelta64`` objects (:issue:`36870`) +- Bug in :class:`IntegerArray` multiplication with ``timedelta`` and ``np.timedelta64`` objects (:issue:`36870`) - Bug in :class:`MultiIndex` comparison with tuple incorrectly treating tuple as array-like (:issue:`21517`) - Bug in :meth:`DataFrame.diff` with 
``datetime64`` dtypes including ``NaT`` values failing to fill ``NaT`` results correctly (:issue:`32441`) - Bug in :class:`DataFrame` arithmetic ops incorrectly accepting keyword arguments (:issue:`36843`) -- Bug in :class:`.IntervalArray` comparisons with :class:`Series` not returning Series (:issue:`36908`) +- Bug in :class:`IntervalArray` comparisons with :class:`Series` not returning :class:`Series` (:issue:`36908`) - Bug in :class:`DataFrame` allowing arithmetic operations with list of array-likes with undefined results. Behavior changed to raising ``ValueError`` (:issue:`36702`) -- Bug in :meth:`DataFrame.std` with ``timedelta64`` dtype and ``skipna=False`` (:issue:`37392`) +- Bug in :meth:`DataFrame.std`` with ``timedelta64`` dtype and ``skipna=False`` (:issue:`37392`) - Bug in :meth:`DataFrame.min` and :meth:`DataFrame.max` with ``datetime64`` dtype and ``skipna=False`` (:issue:`36907`) Conversion @@ -637,53 +489,36 @@ Conversion Strings ^^^^^^^ - Bug in :meth:`Series.to_string`, :meth:`DataFrame.to_string`, and :meth:`DataFrame.to_latex` adding a leading space when ``index=False`` (:issue:`24980`) -- Bug in :func:`to_numeric` raising a ``TypeError`` when attempting to convert a string dtype Series containing only numeric strings and ``NA`` (:issue:`37262`) +- Bug in :func:`to_numeric` raising a ``TypeError`` when attempting to convert a string dtype :class:`Series` containing only numeric strings and ``NA`` (:issue:`37262`) - Interval ^^^^^^^^ - -- Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` where :class:`Interval` dtypes would be converted to object dtypes (:issue:`34871`) - Bug in :meth:`IntervalIndex.take` with negative indices and ``fill_value=None`` (:issue:`37330`) -- Bug in :meth:`IntervalIndex.putmask` with datetime-like dtype incorrectly casting to object dtype (:issue:`37968`) -- Bug in :meth:`IntervalArray.astype` incorrectly dropping dtype information with a :class:`CategoricalDtype` object (:issue:`37984`) +- - Indexing ^^^^^^^^ -- Bug in :meth:`PeriodIndex.get_loc` incorrectly raising ``ValueError`` on non-datelike strings instead of ``KeyError``, causing similar errors in :meth:`Series.__getitem__`, :meth:`Series.__contains__`, and :meth:`Series.loc.__getitem__` (:issue:`34240`) +- Bug in :meth:`PeriodIndex.get_loc` incorrectly raising ``ValueError`` on non-datelike strings instead of ``KeyError``, causing similar errors in :meth:`Series.__geitem__`, :meth:`Series.__contains__`, and :meth:`Series.loc.__getitem__` (:issue:`34240`) - Bug in :meth:`Index.sort_values` where, when empty values were passed, the method would break by trying to compare missing values instead of pushing them to the end of the sort order. (:issue:`35584`) -- Bug in :meth:`Index.get_indexer` and :meth:`Index.get_indexer_non_unique` where ``int64`` arrays are returned instead of ``intp``. (:issue:`36359`) +- Bug in :meth:`Index.get_indexer` and :meth:`Index.get_indexer_non_unique` where int64 arrays are returned instead of intp. (:issue:`36359`) - Bug in :meth:`DataFrame.sort_index` where parameter ascending passed as a list on a single level index gives wrong result. 
(:issue:`32334`) - Bug in :meth:`DataFrame.reset_index` was incorrectly raising a ``ValueError`` for input with a :class:`MultiIndex` with missing values in a level with ``Categorical`` dtype (:issue:`24206`) - Bug in indexing with boolean masks on datetime-like values sometimes returning a view instead of a copy (:issue:`36210`) - Bug in :meth:`DataFrame.__getitem__` and :meth:`DataFrame.loc.__getitem__` with :class:`IntervalIndex` columns and a numeric indexer (:issue:`26490`) - Bug in :meth:`Series.loc.__getitem__` with a non-unique :class:`MultiIndex` and an empty-list indexer (:issue:`13691`) -- Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`MultiIndex` and a level named ``"0"`` (:issue:`37194`) +- Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`MultiIndex` with a level named "0" (:issue:`37194`) - Bug in :meth:`Series.__getitem__` when using an unsigned integer array as an indexer giving incorrect results or segfaulting instead of raising ``KeyError`` (:issue:`37218`) - Bug in :meth:`Index.where` incorrectly casting numeric values to strings (:issue:`37591`) -- Bug in :meth:`Series.loc` and :meth:`DataFrame.loc` raises when the index was of ``object`` dtype and the given numeric label was in the index (:issue:`26491`) -- Bug in :meth:`DataFrame.loc` returned requested key plus missing values when ``loc`` was applied to single level from a :class:`MultiIndex` (:issue:`27104`) -- Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`CategoricalIndex` using a listlike indexer containing NA values (:issue:`37722`) -- Bug in :meth:`DataFrame.loc.__setitem__` expanding an empty :class:`DataFrame` with mixed dtypes (:issue:`37932`) -- Bug in :meth:`DataFrame.xs` ignored ``droplevel=False`` for columns (:issue:`19056`) -- Bug in :meth:`DataFrame.reindex` raising ``IndexingError`` wrongly for empty DataFrame with ``tolerance`` not None or ``method="nearest"`` (:issue:`27315`) -- Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`CategoricalIndex` using listlike indexer that contains elements that are in the index's ``categories`` but not in the index itself failing to raise ``KeyError`` (:issue:`37901`) -- Bug on inserting a boolean label into a :class:`DataFrame` with a numeric :class:`Index` columns incorrectly casting to integer (:issue:`36319`) -- Bug in :meth:`DataFrame.iloc` and :meth:`Series.iloc` aligning objects in ``__setitem__`` (:issue:`22046`) -- Bug in :meth:`MultiIndex.drop` does not raise if labels are partially found (:issue:`37820`) -- Bug in :meth:`DataFrame.loc` did not raise ``KeyError`` when missing combination was given with ``slice(None)`` for remaining levels (:issue:`19556`) -- Bug in :meth:`DataFrame.loc` raising ``TypeError`` when non-integer slice was given to select values from :class:`MultiIndex` (:issue:`25165`, :issue:`24263`) -- Bug in :meth:`DataFrame.loc` returning and assigning elements in wrong order when indexer is differently ordered than the :class:`MultiIndex` to filter (:issue:`31330`, :issue:`34603`) -- Bug in :meth:`DataFrame.loc` and :meth:`DataFrame.__getitem__` raising ``KeyError`` when columns were :class:`MultiIndex` with only one level (:issue:`29749`) -- Bug in :meth:`Series.__getitem__` and :meth:`DataFrame.__getitem__` raising blank ``KeyError`` without missing keys for :class:`IntervalIndex` (:issue:`27365`) +- Bug in :meth:`Series.loc` and :meth:`DataFrame.loc` raises when numeric label was given for object :class:`Index` although label was in 
:class:`Index` (:issue:`26491`) +- Bug in :meth:`DataFrame.loc` returned requested key plus missing values when ``loc`` was applied to single level from :class:`MultiIndex` (:issue:`27104`) Missing ^^^^^^^ -- Bug in :meth:`.SeriesGroupBy.transform` now correctly handles missing values for ``dropna=False`` (:issue:`35014`) -- Bug in :meth:`Series.nunique` with ``dropna=True`` was returning incorrect results when both ``NA`` and ``None`` missing values were present (:issue:`37566`) +- Bug in :meth:`SeriesGroupBy.transform` now correctly handles missing values for ``dropna=False`` (:issue:`35014`) - MultiIndex @@ -692,103 +527,80 @@ MultiIndex - Bug in :meth:`DataFrame.xs` when used with :class:`IndexSlice` raises ``TypeError`` with message ``"Expected label or tuple of labels"`` (:issue:`35301`) - Bug in :meth:`DataFrame.reset_index` with ``NaT`` values in index raises ``ValueError`` with message ``"cannot convert float NaN to integer"`` (:issue:`36541`) - Bug in :meth:`DataFrame.combine_first` when used with :class:`MultiIndex` containing string and ``NaN`` values raises ``TypeError`` (:issue:`36562`) -- Bug in :meth:`MultiIndex.drop` dropped ``NaN`` values when non existing key was given as input (:issue:`18853`) I/O ^^^ - :func:`read_sas` no longer leaks resources on failure (:issue:`35566`) -- Bug in :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` caused a ``ValueError`` when it was called with a filename in combination with ``mode`` containing a ``b`` (:issue:`35058`) -- Bug in :meth:`read_csv` with ``float_precision='round_trip'`` did not handle ``decimal`` and ``thousands`` parameters (:issue:`35365`) +- Bug in :meth:`to_csv` caused a ``ValueError`` when it was called with a filename in combination with ``mode`` containing a ``b`` (:issue:`35058`) +- In :meth:`read_csv` ``float_precision='round_trip'`` now handles ``decimal`` and ``thousands`` parameters (:issue:`35365`) - :meth:`to_pickle` and :meth:`read_pickle` were closing user-provided file objects (:issue:`35679`) - :meth:`to_csv` passes compression arguments for ``'gzip'`` always to ``gzip.GzipFile`` (:issue:`28103`) - :meth:`to_csv` did not support zip compression for binary file object not having a filename (:issue:`35058`) -- :meth:`to_csv` and :meth:`read_csv` did not honor ``compression`` and ``encoding`` for path-like objects that are internally converted to file-like objects (:issue:`35677`, :issue:`26124`, :issue:`32392`) -- :meth:`DataFrame.to_pickle`, :meth:`Series.to_pickle`, and :meth:`read_pickle` did not support compression for file-objects (:issue:`26237`, :issue:`29054`, :issue:`29570`) +- :meth:`to_csv` and :meth:`read_csv` did not honor ``compression`` and ``encoding`` for path-like objects that are internally converted to file-like objects (:issue:`35677`, :issue:`26124`, and :issue:`32392`) +- :meth:`to_picke` and :meth:`read_pickle` did not support compression for file-objects (:issue:`26237`, :issue:`29054`, and :issue:`29570`) - Bug in :func:`LongTableBuilder.middle_separator` was duplicating LaTeX longtable entries in the List of Tables of a LaTeX document (:issue:`34360`) - Bug in :meth:`read_csv` with ``engine='python'`` truncating data if multiple items present in first row and first element started with BOM (:issue:`36343`) - Removed ``private_key`` and ``verbose`` from :func:`read_gbq` as they are no longer supported in ``pandas-gbq`` (:issue:`34654`, :issue:`30200`) - Bumped minimum pytables version to 3.5.1 to avoid a ``ValueError`` in :meth:`read_hdf` (:issue:`24839`) - Bug in 
:func:`read_table` and :func:`read_csv` when ``delim_whitespace=True`` and ``sep=default`` (:issue:`36583`) -- Bug in :meth:`DataFrame.to_json` and :meth:`Series.to_json` when used with ``lines=True`` and ``orient='records'`` the last line of the record is not appended with 'new line character' (:issue:`36888`) +- Bug in :meth:`to_json` with ``lines=True`` and ``orient='records'`` the last line of the record is not appended with 'new line character' (:issue:`36888`) - Bug in :meth:`read_parquet` with fixed offset timezones. String representation of timezones was not recognized (:issue:`35997`, :issue:`36004`) - Bug in :meth:`DataFrame.to_html`, :meth:`DataFrame.to_string`, and :meth:`DataFrame.to_latex` ignoring the ``na_rep`` argument when ``float_format`` was also specified (:issue:`9046`, :issue:`13828`) - Bug in output rendering of complex numbers showing too many trailing zeros (:issue:`36799`) -- Bug in :class:`HDFStore` threw a ``TypeError`` when exporting an empty DataFrame with ``datetime64[ns, tz]`` dtypes with a fixed HDF5 store (:issue:`20594`) -- Bug in :class:`HDFStore` was dropping timezone information when exporting a Series with ``datetime64[ns, tz]`` dtypes with a fixed HDF5 store (:issue:`20594`) +- Bug in :class:`HDFStore` threw a ``TypeError`` when exporting an empty :class:`DataFrame` with ``datetime64[ns, tz]`` dtypes with a fixed HDF5 store (:issue:`20594`) +- Bug in :class:`HDFStore` was dropping timezone information when exporting :class:`Series` with ``datetime64[ns, tz]`` dtypes with a fixed HDF5 store (:issue:`20594`) - :func:`read_csv` was closing user-provided binary file handles when ``engine="c"`` and an ``encoding`` was requested (:issue:`36980`) - Bug in :meth:`DataFrame.to_hdf` was not dropping missing rows with ``dropna=True`` (:issue:`35719`) - Bug in :func:`read_html` was raising a ``TypeError`` when supplying a ``pathlib.Path`` argument to the ``io`` parameter (:issue:`37705`) -- :meth:`DataFrame.to_excel`, :meth:`Series.to_excel`, :meth:`DataFrame.to_markdown`, and :meth:`Series.to_markdown` now support writing to fsspec URLs such as S3 and Google Cloud Storage (:issue:`33987`) -- Bug in :func:`read_fwf` with ``skip_blank_lines=True`` was not skipping blank lines (:issue:`37758`) -- Parse missing values using :func:`read_json` with ``dtype=False`` to ``NaN`` instead of ``None`` (:issue:`28501`) -- :meth:`read_fwf` was inferring compression with ``compression=None`` which was not consistent with the other :meth:``read_*`` functions (:issue:`37909`) -- :meth:`DataFrame.to_html` was ignoring ``formatters`` argument for ``ExtensionDtype`` columns (:issue:`36525`) -- Bumped minimum xarray version to 0.12.3 to avoid reference to the removed ``Panel`` class (:issue:`27101`) - -Period -^^^^^^ - -- Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` where :class:`Period` dtypes would be converted to object dtypes (:issue:`34871`) Plotting ^^^^^^^^ - Bug in :meth:`DataFrame.plot` was rotating xticklabels when ``subplots=True``, even if the x-axis wasn't an irregular time series (:issue:`29460`) -- Bug in :meth:`DataFrame.plot` where a marker letter in the ``style`` keyword sometimes caused a ``ValueError`` (:issue:`21003`) -- Bug in :meth:`DataFrame.plot.bar` and :meth:`Series.plot.bar` where ticks positions were assigned by value order instead of using the actual value for numeric or a smart ordering for string (:issue:`26186`, :issue:`11465`) +- Bug in :meth:`DataFrame.plot` where a marker letter in the ``style`` keyword sometimes causes a 
``ValueError`` (:issue:`21003`) - Twinned axes were losing their tick labels which should only happen to all but the last row or column of 'externally' shared axes (:issue:`33819`) -- Bug in :meth:`Series.plot` and :meth:`DataFrame.plot` was throwing a :exc:`ValueError` when the Series or DataFrame was - indexed by a :class:`.TimedeltaIndex` with a fixed frequency and the x-axis lower limit was greater than the upper limit (:issue:`37454`) -- Bug in :meth:`.DataFrameGroupBy.boxplot` when ``subplots=False`` would raise a ``KeyError`` (:issue:`16748`) -- Bug in :meth:`DataFrame.plot` and :meth:`Series.plot` was overwriting matplotlib's shared y axes behaviour when no ``sharey`` parameter was passed (:issue:`37942`) +- Bug in :meth:`DataFrameGroupBy.boxplot` when ``subplots=False``, a KeyError would raise (:issue:`16748`) Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ -- Bug in :meth:`.DataFrameGroupBy.count` and :meth:`SeriesGroupBy.sum` returning ``NaN`` for missing categories when grouped on multiple ``Categoricals``. Now returning ``0`` (:issue:`35028`) -- Bug in :meth:`.DataFrameGroupBy.apply` that would sometimes throw an erroneous ``ValueError`` if the grouping axis had duplicate entries (:issue:`16646`) -- Bug in :meth:`DataFrame.resample` that would throw a ``ValueError`` when resampling from ``"D"`` to ``"24H"`` over a transition into daylight savings time (DST) (:issue:`35219`) -- Bug when combining methods :meth:`DataFrame.groupby` with :meth:`DataFrame.resample` and :meth:`DataFrame.interpolate` raising a ``TypeError`` (:issue:`35325`) -- Bug in :meth:`.DataFrameGroupBy.apply` where a non-nuisance grouping column would be dropped from the output columns if another groupby method was called before ``.apply`` (:issue:`34656`) +- Bug in :meth:`DataFrameGroupBy.count` and :meth:`SeriesGroupBy.sum` returning ``NaN`` for missing categories when grouped on multiple ``Categoricals``. Now returning ``0`` (:issue:`35028`) +- Bug in :meth:`DataFrameGroupBy.apply` that would some times throw an erroneous ``ValueError`` if the grouping axis had duplicate entries (:issue:`16646`) +- Bug in :meth:`DataFrame.resample(...)` that would throw a ``ValueError`` when resampling from "D" to "24H" over a transition into daylight savings time (DST) (:issue:`35219`) +- Bug when combining methods :meth:`DataFrame.groupby` with :meth:`DataFrame.resample` and :meth:`DataFrame.interpolate` raising an ``TypeError`` (:issue:`35325`) +- Bug in :meth:`DataFrameGroupBy.apply` where a non-nuisance grouping column would be dropped from the output columns if another groupby method was called before ``.apply()`` (:issue:`34656`) - Bug when subsetting columns on a :class:`~pandas.core.groupby.DataFrameGroupBy` (e.g. ``df.groupby('a')[['b']])``) would reset the attributes ``axis``, ``dropna``, ``group_keys``, ``level``, ``mutated``, ``sort``, and ``squeeze`` to their default values. 
(:issue:`9959`) -- Bug in :meth:`.DataFrameGroupBy.tshift` failing to raise ``ValueError`` when a frequency cannot be inferred for the index of a group (:issue:`35937`) +- Bug in :meth:`DataFrameGroupby.tshift` failing to raise ``ValueError`` when a frequency cannot be inferred for the index of a group (:issue:`35937`) - Bug in :meth:`DataFrame.groupby` does not always maintain column index name for ``any``, ``all``, ``bfill``, ``ffill``, ``shift`` (:issue:`29764`) -- Bug in :meth:`.DataFrameGroupBy.apply` raising error with ``np.nan`` group(s) when ``dropna=False`` (:issue:`35889`) -- Bug in :meth:`.Rolling.sum` returned wrong values when dtypes where mixed between float and integer and ``axis=1`` (:issue:`20649`, :issue:`35596`) -- Bug in :meth:`.Rolling.count` returned ``np.nan`` with :class:`~pandas.api.indexers.FixedForwardWindowIndexer` as window, ``min_periods=0`` and only missing values in the window (:issue:`35579`) +- Bug in :meth:`DataFrameGroupBy.apply` raising error with ``np.nan`` group(s) when ``dropna=False`` (:issue:`35889`) +- Bug in :meth:`Rolling.sum()` returned wrong values when dtypes where mixed between float and integer and axis was equal to one (:issue:`20649`, :issue:`35596`) +- Bug in :meth:`Rolling.count` returned ``np.nan`` with :class:`pandas.api.indexers.FixedForwardWindowIndexer` as window, ``min_periods=0`` and only missing values in window (:issue:`35579`) - Bug where :class:`pandas.core.window.Rolling` produces incorrect window sizes when using a ``PeriodIndex`` (:issue:`34225`) -- Bug in :meth:`.DataFrameGroupBy.ffill` and :meth:`.DataFrameGroupBy.bfill` where a ``NaN`` group would return filled values instead of ``NaN`` when ``dropna=True`` (:issue:`34725`) -- Bug in :meth:`.RollingGroupby.count` where a ``ValueError`` was raised when specifying the ``closed`` parameter (:issue:`35869`) -- Bug in :meth:`.DataFrameGroupBy.rolling` returning wrong values with partial centered window (:issue:`36040`) -- Bug in :meth:`.DataFrameGroupBy.rolling` returned wrong values with timeaware window containing ``NaN``. Raises ``ValueError`` because windows are not monotonic now (:issue:`34617`) -- Bug in :meth:`.Rolling.__iter__` where a ``ValueError`` was not raised when ``min_periods`` was larger than ``window`` (:issue:`37156`) -- Using :meth:`.Rolling.var` instead of :meth:`.Rolling.std` avoids numerical issues for :meth:`.Rolling.corr` when :meth:`.Rolling.var` is still within floating point precision while :meth:`.Rolling.std` is not (:issue:`31286`) -- Bug in :meth:`.DataFrameGroupBy.quantile` and :meth:`.Resampler.quantile` raised ``TypeError`` when values were of type ``Timedelta`` (:issue:`29485`) -- Bug in :meth:`.Rolling.median` and :meth:`.Rolling.quantile` returned wrong values for :class:`.BaseIndexer` subclasses with non-monotonic starting or ending points for windows (:issue:`37153`) +- Bug in :meth:`DataFrameGroupBy.ffill` and :meth:`DataFrameGroupBy.bfill` where a ``NaN`` group would return filled values instead of ``NaN`` when ``dropna=True`` (:issue:`34725`) +- Bug in :meth:`RollingGroupby.count` where a ``ValueError`` was raised when specifying the ``closed`` parameter (:issue:`35869`) +- Bug in :meth:`DataFrame.groupby.rolling` returning wrong values with partial centered window (:issue:`36040`). +- Bug in :meth:`DataFrameGroupBy.rolling` returned wrong values with timeaware window containing ``NaN``. 
Raises ``ValueError`` because windows are not monotonic now (:issue:`34617`) +- Bug in :meth:`Rolling.__iter__` where a ``ValueError`` was not raised when ``min_periods`` was larger than ``window`` (:issue:`37156`) +- Using :meth:`Rolling.var()` instead of :meth:`Rolling.std()` avoids numerical issues for :meth:`Rolling.corr()` when :meth:`Rolling.var()` is still within floating point precision while :meth:`Rolling.std()` is not (:issue:`31286`) +- Bug in :meth:`df.groupby(..).quantile() ` and :meth:`df.resample(..).quantile() ` raised ``TypeError`` when values were of type ``Timedelta`` (:issue:`29485`) +- Bug in :meth:`Rolling.median` and :meth:`Rolling.quantile` returned wrong values for :class:`BaseIndexer` subclasses with non-monotonic starting or ending points for windows (:issue:`37153`) - Bug in :meth:`DataFrame.groupby` dropped ``nan`` groups from result with ``dropna=False`` when grouping over a single column (:issue:`35646`, :issue:`35542`) -- Bug in :meth:`.DataFrameGroupBy.head`, :meth:`.DataFrameGroupBy.tail`, :meth:`SeriesGroupBy.head`, and :meth:`SeriesGroupBy.tail` would raise when used with ``axis=1`` (:issue:`9772`) -- Bug in :meth:`.DataFrameGroupBy.transform` would raise when used with ``axis=1`` and a transformation kernel (e.g. "shift") (:issue:`36308`) Reshaping ^^^^^^^^^ - Bug in :meth:`DataFrame.pivot_table` with ``aggfunc='count'`` or ``aggfunc='sum'`` returning ``NaN`` for missing categories when pivoted on a ``Categorical``. Now returning ``0`` (:issue:`31422`) -- Bug in :func:`concat` and :class:`DataFrame` constructor where input index names are not preserved in some cases (:issue:`13475`) +- Bug in :func:`union_indexes` where input index names are not preserved in some cases. Affects :func:`concat` and :class:`DataFrame` constructor (:issue:`13475`) - Bug in func :meth:`crosstab` when using multiple columns with ``margins=True`` and ``normalize=True`` (:issue:`35144`) -- Bug in :meth:`DataFrame.stack` where an empty DataFrame.stack would raise an error (:issue:`36113`). Now returning an empty Series with empty MultiIndex. -- Bug in :meth:`Series.unstack`. Now a Series with single level of Index trying to unstack would raise a ValueError. 
(:issue:`36113`) - Bug in :meth:`DataFrame.agg` with ``func={'name':}`` incorrectly raising ``TypeError`` when ``DataFrame.columns==['Name']`` (:issue:`36212`) -- Bug in :meth:`Series.transform` would give incorrect results or raise when the argument ``func`` was a dictionary (:issue:`35811`) -- Bug in :meth:`DataFrame.pivot` did not preserve :class:`MultiIndex` level names for columns when rows and columns are both multiindexed (:issue:`36360`) -- Bug in :meth:`DataFrame.pivot` modified ``index`` argument when ``columns`` was passed but ``values`` was not (:issue:`37635`) -- Bug in :meth:`DataFrame.join` returned a non deterministic level-order for the resulting :class:`MultiIndex` (:issue:`36910`) -- Bug in :meth:`DataFrame.combine_first` caused wrong alignment with dtype ``string`` and one level of ``MultiIndex`` containing only ``NA`` (:issue:`37591`) -- Fixed regression in :func:`merge` on merging :class:`.DatetimeIndex` with empty DataFrame (:issue:`36895`) +- Bug in :meth:`Series.transform` would give incorrect results or raise when the argument ``func`` was dictionary (:issue:`35811`) +- Bug in :meth:`DataFrame.pivot` did not preserve :class:`MultiIndex` level names for columns when rows and columns both multiindexed (:issue:`36360`) +- Bug in :func:`join` returned a non deterministic level-order for the resulting :class:`MultiIndex` (:issue:`36910`) +- Bug in :meth:`DataFrame.combine_first()` caused wrong alignment with dtype ``string`` and one level of ``MultiIndex`` containing only ``NA`` (:issue:`37591`) +- Fixed regression in :func:`merge` on merging DatetimeIndex with empty DataFrame (:issue:`36895`) - Bug in :meth:`DataFrame.apply` not setting index of return value when ``func`` return type is ``dict`` (:issue:`37544`) -- Bug in :func:`concat` resulting in a ``ValueError`` when at least one of both inputs had a non-unique index (:issue:`36263`) -- Bug in :meth:`DataFrame.merge` and :meth:`pandas.merge` returning inconsistent ordering in result for ``how=right`` and ``how=left`` (:issue:`35382`) Sparse ^^^^^^ @@ -799,26 +611,23 @@ Sparse ExtensionArray ^^^^^^^^^^^^^^ -- Fixed bug where :class:`DataFrame` column set to scalar extension type via a dict instantiation was considered an object type rather than the extension type (:issue:`35965`) -- Fixed bug where ``astype()`` with equal dtype and ``copy=False`` would return a new object (:issue:`28488`) -- Fixed bug when applying a NumPy ufunc with multiple outputs to an :class:`.IntegerArray` returning None (:issue:`36913`) -- Fixed an inconsistency in :class:`.PeriodArray`'s ``__init__`` signature to those of :class:`.DatetimeArray` and :class:`.TimedeltaArray` (:issue:`37289`) -- Reductions for :class:`.BooleanArray`, :class:`.Categorical`, :class:`.DatetimeArray`, :class:`.FloatingArray`, :class:`.IntegerArray`, :class:`.PeriodArray`, :class:`.TimedeltaArray`, and :class:`.PandasArray` are now keyword-only methods (:issue:`37541`) +- Fixed Bug where :class:`DataFrame` column set to scalar extension type via a dict instantion was considered an object type rather than the extension type (:issue:`35965`) +- Fixed bug where ``astype()`` with equal dtype and ``copy=False`` would return a new object (:issue:`284881`) +- Fixed bug when applying a NumPy ufunc with multiple outputs to a :class:`pandas.arrays.IntegerArray` returning None (:issue:`36913`) +- Fixed an inconsistency in :class:`PeriodArray`'s ``__init__`` signature to those of :class:`DatetimeArray` and :class:`TimedeltaArray` (:issue:`37289`) +- Reductions for 
:class:`BooleanArray`, :class:`Categorical`, :class:`DatetimeArray`, :class:`FloatingArray`, :class:`IntegerArray`, :class:`PeriodArray`, :class:`TimedeltaArray`, and :class:`PandasArray` are now keyword-only methods (:issue:`37541`) Other ^^^^^ -- Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` incorrectly raising an ``AssertionError`` instead of a ``ValueError`` when invalid parameter combinations are passed (:issue:`36045`) +- Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` incorrectly raising ``AssertionError`` instead of ``ValueError`` when invalid parameter combinations are passed (:issue:`36045`) - Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` with numeric values and string ``to_replace`` (:issue:`34789`) -- Fixed metadata propagation in :meth:`Series.abs` and ufuncs called on Series and DataFrames (:issue:`28283`) -- Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` incorrectly casting from ``PeriodDtype`` to object dtype (:issue:`34871`) - Fixed bug in metadata propagation incorrectly copying DataFrame columns as metadata when the column name overlaps with the metadata name (:issue:`37037`) -- Fixed metadata propagation in the :class:`Series.dt`, :class:`Series.str` accessors, :class:`DataFrame.duplicated`, :class:`DataFrame.stack`, :class:`DataFrame.unstack`, :class:`DataFrame.pivot`, :class:`DataFrame.append`, :class:`DataFrame.diff`, :class:`DataFrame.applymap` and :class:`DataFrame.update` methods (:issue:`28283`, :issue:`37381`) -- Fixed metadata propagation when selecting columns with ``DataFrame.__getitem__`` (:issue:`28283`) -- Bug in :meth:`Index.union` behaving differently depending on whether operand is an :class:`Index` or other list-like (:issue:`36384`) -- Passing an array with 2 or more dimensions to the :class:`Series` constructor now raises the more specific ``ValueError`` rather than a bare ``Exception`` (:issue:`35744`) -- Bug in ``dir`` where ``dir(obj)`` wouldn't show attributes defined on the instance for pandas objects (:issue:`37173`) -- Bug in :meth:`RangeIndex.difference` returning :class:`Int64Index` in some cases where it should return :class:`RangeIndex` (:issue:`38028`) +- Fixed metadata propagation in the :class:`Series.dt`, :class:`Series.str` accessors, :class:`DataFrame.duplicated`, :class:`DataFrame.stack`, :class:`DataFrame.unstack`, :class:`DataFrame.pivot`, :class:`DataFrame.append`, :class:`DataFrame.diff`, :class:`DataFrame.applymap` and :class:`DataFrame.update` methods (:issue:`28283`) (:issue:`37381`) +- Bug in :meth:`Index.union` behaving differently depending on whether operand is a :class:`Index` or other list-like (:issue:`36384`) +- Passing an array with 2 or more dimensions to the :class:`Series` constructor now raises the more specific ``ValueError``, from a bare ``Exception`` previously (:issue:`35744`) +- Bug in ``accessor.DirNamesMixin``, where ``dir(obj)`` wouldn't show attributes defined on the instance (:issue:`37173`). +- Bug in :meth:`Series.nunique` with ``dropna=True`` was returning incorrect results when both ``NA`` and ``None`` missing values were present (:issue:`37566`) .. --------------------------------------------------------------------------- diff --git a/environment.yml b/environment.yml index b99b856187fb6..77a9c5fd4822d 100644 --- a/environment.yml +++ b/environment.yml @@ -12,9 +12,6 @@ dependencies: - asv # building - # The compiler packages are meta-packages and install the correct compiler (activation) packages on the respective platforms. 
- - c-compiler - - cxx-compiler - cython>=0.29.21 # code checks diff --git a/pandas/__init__.py b/pandas/__init__.py index cc5d835a52833..cf7ae2505b72d 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -33,7 +33,7 @@ raise ImportError( f"C extension: {module} not built. If you want to import " "pandas from the source directory, you may need to run " - "'python setup.py build_ext --force' to build the C extensions first." + "'python setup.py build_ext --inplace --force' to build the C extensions first." ) from e from pandas._config import ( @@ -189,10 +189,25 @@ # GH 27101 +# TODO: remove Panel compat in 1.0 def __getattr__(name): import warnings - if name == "datetime": + if name == "Panel": + + warnings.warn( + "The Panel class is removed from pandas. Accessing it " + "from the top-level namespace will also be removed in the next version", + FutureWarning, + stacklevel=2, + ) + + class Panel: + pass + + return Panel + + elif name == "datetime": warnings.warn( "The pandas.datetime class is deprecated " "and will be removed from pandas in a future version. " diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 24156c88f0d76..5a958d5e0bd3c 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -903,12 +903,13 @@ def group_last(rank_t[:, :] out, ndarray[int64_t, ndim=2] nobs bint runtime_error = False + assert min_count == -1, "'min_count' only used in add and prod" + # TODO(cython 3.0): # Instead of `labels.shape[0]` use `len(labels)` if not len(values) == labels.shape[0]: raise AssertionError("len(index) != len(labels)") - min_count = max(min_count, 1) nobs = np.zeros((out).shape, dtype=np.int64) if rank_t is object: resx = np.empty((out).shape, dtype=object) @@ -938,7 +939,7 @@ def group_last(rank_t[:, :] out, for i in range(ncounts): for j in range(K): - if nobs[i, j] < min_count: + if nobs[i, j] == 0: out[i, j] = NAN else: out[i, j] = resx[i, j] @@ -960,7 +961,7 @@ def group_last(rank_t[:, :] out, for i in range(ncounts): for j in range(K): - if nobs[i, j] < min_count: + if nobs[i, j] == 0: if rank_t is int64_t: out[i, j] = NPY_NAT elif rank_t is uint64_t: @@ -985,9 +986,8 @@ def group_last(rank_t[:, :] out, def group_nth(rank_t[:, :] out, int64_t[:] counts, ndarray[rank_t, ndim=2] values, - const int64_t[:] labels, - int64_t min_count=-1, int64_t rank=1 - ): + const int64_t[:] labels, int64_t rank=1, + Py_ssize_t min_count=-1): """ Only aggregates on axis=0 """ @@ -998,12 +998,13 @@ def group_nth(rank_t[:, :] out, ndarray[int64_t, ndim=2] nobs bint runtime_error = False + assert min_count == -1, "'min_count' only used in add and prod" + # TODO(cython 3.0): # Instead of `labels.shape[0]` use `len(labels)` if not len(values) == labels.shape[0]: raise AssertionError("len(index) != len(labels)") - min_count = max(min_count, 1) nobs = np.zeros((out).shape, dtype=np.int64) if rank_t is object: resx = np.empty((out).shape, dtype=object) @@ -1034,7 +1035,7 @@ def group_nth(rank_t[:, :] out, for i in range(ncounts): for j in range(K): - if nobs[i, j] < min_count: + if nobs[i, j] == 0: out[i, j] = NAN else: out[i, j] = resx[i, j] @@ -1058,7 +1059,7 @@ def group_nth(rank_t[:, :] out, for i in range(ncounts): for j in range(K): - if nobs[i, j] < min_count: + if nobs[i, j] == 0: if rank_t is int64_t: out[i, j] = NPY_NAT elif rank_t is uint64_t: @@ -1295,12 +1296,13 @@ def group_max(groupby_t[:, :] out, bint runtime_error = False int64_t[:, :] nobs + assert min_count == -1, "'min_count' only used in add and prod" + # TODO(cython 3.0): # Instead of 
`labels.shape[0]` use `len(labels)` if not len(values) == labels.shape[0]: raise AssertionError("len(index) != len(labels)") - min_count = max(min_count, 1) nobs = np.zeros((out).shape, dtype=np.int64) maxx = np.empty_like(out) @@ -1337,12 +1339,11 @@ def group_max(groupby_t[:, :] out, for i in range(ncounts): for j in range(K): - if nobs[i, j] < min_count: + if nobs[i, j] == 0: if groupby_t is uint64_t: runtime_error = True break else: - out[i, j] = nan_val else: out[i, j] = maxx[i, j] @@ -1370,12 +1371,13 @@ def group_min(groupby_t[:, :] out, bint runtime_error = False int64_t[:, :] nobs + assert min_count == -1, "'min_count' only used in add and prod" + # TODO(cython 3.0): # Instead of `labels.shape[0]` use `len(labels)` if not len(values) == labels.shape[0]: raise AssertionError("len(index) != len(labels)") - min_count = max(min_count, 1) nobs = np.zeros((out).shape, dtype=np.int64) minx = np.empty_like(out) @@ -1411,7 +1413,7 @@ def group_min(groupby_t[:, :] out, for i in range(ncounts): for j in range(K): - if nobs[i, j] < min_count: + if nobs[i, j] == 0: if groupby_t is uint64_t: runtime_error = True break diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd index 7b630c264753f..75c273b35ee7d 100644 --- a/pandas/_libs/hashtable.pxd +++ b/pandas/_libs/hashtable.pxd @@ -1,27 +1,13 @@ from numpy cimport intp_t, ndarray from pandas._libs.khash cimport ( - float32_t, float64_t, - int8_t, - int16_t, - int32_t, int64_t, - kh_float32_t, kh_float64_t, - kh_int8_t, - kh_int16_t, - kh_int32_t, kh_int64_t, kh_pymap_t, kh_str_t, - kh_uint8_t, - kh_uint16_t, - kh_uint32_t, kh_uint64_t, - uint8_t, - uint16_t, - uint32_t, uint64_t, ) @@ -42,54 +28,12 @@ cdef class Int64HashTable(HashTable): cpdef get_item(self, int64_t val) cpdef set_item(self, int64_t key, Py_ssize_t val) -cdef class UInt32HashTable(HashTable): - cdef kh_uint32_t *table - - cpdef get_item(self, uint32_t val) - cpdef set_item(self, uint32_t key, Py_ssize_t val) - -cdef class Int32HashTable(HashTable): - cdef kh_int32_t *table - - cpdef get_item(self, int32_t val) - cpdef set_item(self, int32_t key, Py_ssize_t val) - -cdef class UInt16HashTable(HashTable): - cdef kh_uint16_t *table - - cpdef get_item(self, uint16_t val) - cpdef set_item(self, uint16_t key, Py_ssize_t val) - -cdef class Int16HashTable(HashTable): - cdef kh_int16_t *table - - cpdef get_item(self, int16_t val) - cpdef set_item(self, int16_t key, Py_ssize_t val) - -cdef class UInt8HashTable(HashTable): - cdef kh_uint8_t *table - - cpdef get_item(self, uint8_t val) - cpdef set_item(self, uint8_t key, Py_ssize_t val) - -cdef class Int8HashTable(HashTable): - cdef kh_int8_t *table - - cpdef get_item(self, int8_t val) - cpdef set_item(self, int8_t key, Py_ssize_t val) - cdef class Float64HashTable(HashTable): cdef kh_float64_t *table cpdef get_item(self, float64_t val) cpdef set_item(self, float64_t key, Py_ssize_t val) -cdef class Float32HashTable(HashTable): - cdef kh_float32_t *table - - cpdef get_item(self, float32_t val) - cpdef set_item(self, float32_t key, Py_ssize_t val) - cdef class PyObjectHashTable(HashTable): cdef kh_pymap_t *table diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 963fddd4d5af9..5a0cddb0af197 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -13,14 +13,48 @@ cnp.import_array() from pandas._libs cimport util -from pandas._libs.khash cimport KHASH_TRACE_DOMAIN, kh_str_t, khiter_t +from pandas._libs.khash cimport ( + kh_destroy_float64, + kh_destroy_int64, + kh_destroy_pymap, + 
kh_destroy_str, + kh_destroy_uint64, + kh_exist_float64, + kh_exist_int64, + kh_exist_pymap, + kh_exist_str, + kh_exist_uint64, + kh_float64_t, + kh_get_float64, + kh_get_int64, + kh_get_pymap, + kh_get_str, + kh_get_strbox, + kh_get_uint64, + kh_init_float64, + kh_init_int64, + kh_init_pymap, + kh_init_str, + kh_init_strbox, + kh_init_uint64, + kh_int64_t, + kh_put_float64, + kh_put_int64, + kh_put_pymap, + kh_put_str, + kh_put_strbox, + kh_put_uint64, + kh_resize_float64, + kh_resize_int64, + kh_resize_pymap, + kh_resize_str, + kh_resize_uint64, + kh_str_t, + khiter_t, +) from pandas._libs.missing cimport checknull -def get_hashtable_trace_domain(): - return KHASH_TRACE_DOMAIN - - cdef int64_t NPY_NAT = util.get_nat() SIZE_HINT_LIMIT = (1 << 20) + 7 diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index b582ed1533a8e..da91fa69b0dec 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -5,35 +5,6 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in """ -{{py: - -# name -cimported_types = ['float32', - 'float64', - 'int8', - 'int16', - 'int32', - 'int64', - 'pymap', - 'str', - 'strbox', - 'uint8', - 'uint16', - 'uint32', - 'uint64'] -}} - -{{for name in cimported_types}} -from pandas._libs.khash cimport ( - kh_destroy_{{name}}, - kh_exist_{{name}}, - kh_get_{{name}}, - kh_init_{{name}}, - kh_put_{{name}}, - kh_resize_{{name}}, -) -{{endfor}} - # ---------------------------------------------------------------------- # VectorData # ---------------------------------------------------------------------- @@ -49,16 +20,9 @@ from pandas._libs.missing cimport C_NA # for uniques in hashtables) dtypes = [('Float64', 'float64', 'float64_t'), - ('Float32', 'float32', 'float32_t'), ('Int64', 'int64', 'int64_t'), - ('Int32', 'int32', 'int32_t'), - ('Int16', 'int16', 'int16_t'), - ('Int8', 'int8', 'int8_t'), ('String', 'string', 'char *'), - ('UInt64', 'uint64', 'uint64_t'), - ('UInt32', 'uint32', 'uint32_t'), - ('UInt16', 'uint16', 'uint16_t'), - ('UInt8', 'uint8', 'uint8_t')] + ('UInt64', 'uint64', 'uint64_t')] }} {{for name, dtype, c_type in dtypes}} @@ -85,15 +49,8 @@ cdef inline void append_data_{{dtype}}({{name}}VectorData *data, ctypedef fused vector_data: Int64VectorData - Int32VectorData - Int16VectorData - Int8VectorData UInt64VectorData - UInt32VectorData - UInt16VectorData - UInt8VectorData Float64VectorData - Float32VectorData StringVectorData cdef inline bint needs_resize(vector_data *data) nogil: @@ -108,14 +65,7 @@ cdef inline bint needs_resize(vector_data *data) nogil: # name, dtype, c_type dtypes = [('Float64', 'float64', 'float64_t'), ('UInt64', 'uint64', 'uint64_t'), - ('Int64', 'int64', 'int64_t'), - ('Float32', 'float32', 'float32_t'), - ('UInt32', 'uint32', 'uint32_t'), - ('Int32', 'int32', 'int32_t'), - ('UInt16', 'uint16', 'uint16_t'), - ('Int16', 'int16', 'int16_t'), - ('UInt8', 'uint8', 'uint8_t'), - ('Int8', 'int8', 'int8_t')] + ('Int64', 'int64', 'int64_t')] }} @@ -303,22 +253,15 @@ cdef class HashTable: {{py: -# name, dtype, float_group -dtypes = [('Float64', 'float64', True), - ('UInt64', 'uint64', False), - ('Int64', 'int64', False), - ('Float32', 'float32', True), - ('UInt32', 'uint32', False), - ('Int32', 'int32', False), - ('UInt16', 'uint16', False), - ('Int16', 'int16', False), - ('UInt8', 'uint8', False), - ('Int8', 'int8', False)] +# name, dtype, float_group, default_na_value +dtypes = [('Float64', 'float64', True, 'np.nan'), + ('UInt64', 
'uint64', False, 0), + ('Int64', 'int64', False, 'NPY_NAT')] }} -{{for name, dtype, float_group in dtypes}} +{{for name, dtype, float_group, default_na_value in dtypes}} cdef class {{name}}HashTable(HashTable): @@ -344,11 +287,9 @@ cdef class {{name}}HashTable(HashTable): def sizeof(self, deep=False): """ return the size of my table in bytes """ - overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*) - for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t) - for_pairs = self.table.n_buckets * (sizeof({{dtype}}_t) + # keys - sizeof(Py_ssize_t)) # vals - return overhead + for_flags + for_pairs + return self.table.n_buckets * (sizeof({{dtype}}_t) + # keys + sizeof(Py_ssize_t) + # vals + sizeof(uint32_t)) # flags cpdef get_item(self, {{dtype}}_t val): cdef: @@ -489,7 +430,7 @@ cdef class {{name}}HashTable(HashTable): # which is only used if it's *specified*. na_value2 = <{{dtype}}_t>na_value else: - na_value2 = 0 + na_value2 = {{default_na_value}} with nogil: for i in range(n): @@ -671,11 +612,10 @@ cdef class StringHashTable(HashTable): self.table = NULL def sizeof(self, deep=False): - overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*) - for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t) - for_pairs = self.table.n_buckets * (sizeof(char *) + # keys - sizeof(Py_ssize_t)) # vals - return overhead + for_flags + for_pairs + """ return the size of my table in bytes """ + return self.table.n_buckets * (sizeof(char *) + # keys + sizeof(Py_ssize_t) + # vals + sizeof(uint32_t)) # flags cpdef get_item(self, str val): cdef: @@ -997,11 +937,9 @@ cdef class PyObjectHashTable(HashTable): def sizeof(self, deep=False): """ return the size of my table in bytes """ - overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*) - for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t) - for_pairs = self.table.n_buckets * (sizeof(PyObject *) + # keys - sizeof(Py_ssize_t)) # vals - return overhead + for_flags + for_pairs + return self.table.n_buckets * (sizeof(PyObject *) + # keys + sizeof(Py_ssize_t) + # vals + sizeof(uint32_t)) # flags cpdef get_item(self, object val): cdef: diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 7c5afa4ff6b27..4a466ada765ca 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -8,16 +8,9 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in # dtype, ttype, c_type dtypes = [('float64', 'float64', 'float64_t'), - ('float32', 'float32', 'float32_t'), ('uint64', 'uint64', 'uint64_t'), - ('uint32', 'uint32', 'uint32_t'), - ('uint16', 'uint16', 'uint16_t'), - ('uint8', 'uint8', 'uint8_t'), ('object', 'pymap', 'object'), - ('int64', 'int64', 'int64_t'), - ('int32', 'int32', 'int32_t'), - ('int16', 'int16', 'int16_t'), - ('int8', 'int8', 'int8_t')] + ('int64', 'int64', 'int64_t')] }} @@ -61,7 +54,7 @@ cdef build_count_table_{{dtype}}({{dtype}}_t[:] values, for i in range(n): val = values[i] - {{if dtype == 'float64' or dtype == 'float32'}} + {{if dtype == 'float64'}} if val == val or not dropna: {{else}} if True: @@ -282,15 +275,8 @@ def ismember_{{dtype}}(const {{c_type}}[:] arr, const {{c_type}}[:] values): # dtype, ctype, table_type, npy_dtype dtypes = [('float64', 'float64_t', 'float64', 'float64'), - ('float32', 'float32_t', 'float32', 'float32'), ('int64', 'int64_t', 'int64', 'int64'), - ('int32', 'int32_t', 'int32', 'int32'), - ('int16', 'int16_t', 'int16', 'int16'), - ('int8', 'int8_t', 'int8', 'int8'), ('uint64', 
'uint64_t', 'uint64', 'uint64'), - ('uint32', 'uint32_t', 'uint32', 'uint32'), - ('uint16', 'uint16_t', 'uint16', 'uint16'), - ('uint8', 'uint8_t', 'uint8', 'uint8'), ('object', 'object', 'pymap', 'object_')] }} diff --git a/pandas/_libs/index_class_helper.pxi.in b/pandas/_libs/index_class_helper.pxi.in index 69680e472bbc2..c7b67667bda17 100644 --- a/pandas/_libs/index_class_helper.pxi.in +++ b/pandas/_libs/index_class_helper.pxi.in @@ -10,21 +10,21 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in {{py: -# name, dtype -dtypes = [('Float64', 'float64'), - ('Float32', 'float32'), - ('Int64', 'int64'), - ('Int32', 'int32'), - ('Int16', 'int16'), - ('Int8', 'int8'), - ('UInt64', 'uint64'), - ('UInt32', 'uint32'), - ('UInt16', 'uint16'), - ('UInt8', 'uint8'), +# name, dtype, hashtable_name +dtypes = [('Float64', 'float64', 'Float64'), + ('Float32', 'float32', 'Float64'), + ('Int64', 'int64', 'Int64'), + ('Int32', 'int32', 'Int64'), + ('Int16', 'int16', 'Int64'), + ('Int8', 'int8', 'Int64'), + ('UInt64', 'uint64', 'UInt64'), + ('UInt32', 'uint32', 'UInt64'), + ('UInt16', 'uint16', 'UInt64'), + ('UInt8', 'uint8', 'UInt64'), ] }} -{{for name, dtype in dtypes}} +{{for name, dtype, hashtable_name in dtypes}} cdef class {{name}}Engine(IndexEngine): @@ -32,7 +32,7 @@ cdef class {{name}}Engine(IndexEngine): # returns an ndarray with dtype {{dtype}}_t cdef _make_hash_table(self, Py_ssize_t n): - return _hash.{{name}}HashTable(n) + return _hash.{{hashtable_name}}HashTable(n) {{if name not in {'Float64', 'Float32'} }} cdef _check_type(self, object val): @@ -41,7 +41,9 @@ cdef class {{name}}Engine(IndexEngine): {{endif}} cdef void _call_map_locations(self, values): - self.mapping.map_locations(algos.ensure_{{name.lower()}}(values)) + # self.mapping is of type {{hashtable_name}}HashTable, + # so convert dtype of values + self.mapping.map_locations(algos.ensure_{{hashtable_name.lower()}}(values)) cdef _maybe_get_bool_indexer(self, object val): cdef: diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index 10becdce5d6dd..f8bcbcfb158b5 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -179,8 +179,7 @@ cdef class IntervalMixin: return (self.right == self.left) & (self.closed != 'both') def _check_closed_matches(self, other, name='other'): - """ - Check if the closed attribute of `other` matches. + """Check if the closed attribute of `other` matches. Note that 'left' and 'right' are considered different from 'both'. 
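As context for the index engine change above (index_class_helper.pxi.in routes the Float32 engine to the Float64 hash table, Int8/Int16/Int32 to Int64, and UInt8/UInt16/UInt32 to UInt64, converting values via ``algos.ensure_*`` before ``map_locations``), here is a rough Python sketch of that idea. It is not the pandas Cython implementation; the names WIDEN_TO, ensure_widened and map_locations are illustrative stand-ins only.

import numpy as np

# hypothetical mapping, mirroring the (name, dtype, hashtable_name) table in the diff above
WIDEN_TO = {
    "int8": "int64", "int16": "int64", "int32": "int64",
    "uint8": "uint64", "uint16": "uint64", "uint32": "uint64",
    "float32": "float64",
}

def ensure_widened(values: np.ndarray) -> np.ndarray:
    # Upcast narrow dtypes so only int64/uint64/float64 "hash tables" are ever needed.
    return values.astype(WIDEN_TO.get(values.dtype.name, values.dtype.name), copy=False)

def map_locations(values: np.ndarray) -> dict:
    # Stand-in for the wide HashTable.map_locations: value -> position.
    widened = ensure_widened(values)
    return {val: i for i, val in enumerate(widened)}

# usage: an int8 index still resolves lookups, but through int64-typed keys
mapping = map_locations(np.array([7, 3, 5], dtype="int8"))
assert mapping[np.int64(3)] == 1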
diff --git a/pandas/_libs/khash.pxd b/pandas/_libs/khash.pxd index 0d0c5ae058b21..1bb3a158b4b1a 100644 --- a/pandas/_libs/khash.pxd +++ b/pandas/_libs/khash.pxd @@ -1,21 +1,8 @@ from cpython.object cimport PyObject -from numpy cimport ( - float32_t, - float64_t, - int8_t, - int16_t, - int32_t, - int64_t, - uint8_t, - uint16_t, - uint32_t, - uint64_t, -) +from numpy cimport float64_t, int32_t, int64_t, uint32_t, uint64_t cdef extern from "khash_python.h": - const int KHASH_TRACE_DOMAIN - ctypedef uint32_t khint_t ctypedef khint_t khiter_t @@ -80,6 +67,72 @@ cdef extern from "khash_python.h": void kh_destroy_str_starts(kh_str_starts_t*) nogil void kh_resize_str_starts(kh_str_starts_t*, khint_t) nogil + ctypedef struct kh_int64_t: + khint_t n_buckets, size, n_occupied, upper_bound + uint32_t *flags + int64_t *keys + size_t *vals + + kh_int64_t* kh_init_int64() nogil + void kh_destroy_int64(kh_int64_t*) nogil + void kh_clear_int64(kh_int64_t*) nogil + khint_t kh_get_int64(kh_int64_t*, int64_t) nogil + void kh_resize_int64(kh_int64_t*, khint_t) nogil + khint_t kh_put_int64(kh_int64_t*, int64_t, int*) nogil + void kh_del_int64(kh_int64_t*, khint_t) nogil + + bint kh_exist_int64(kh_int64_t*, khiter_t) nogil + + ctypedef uint64_t khuint64_t + + ctypedef struct kh_uint64_t: + khint_t n_buckets, size, n_occupied, upper_bound + uint32_t *flags + khuint64_t *keys + size_t *vals + + kh_uint64_t* kh_init_uint64() nogil + void kh_destroy_uint64(kh_uint64_t*) nogil + void kh_clear_uint64(kh_uint64_t*) nogil + khint_t kh_get_uint64(kh_uint64_t*, uint64_t) nogil + void kh_resize_uint64(kh_uint64_t*, khint_t) nogil + khint_t kh_put_uint64(kh_uint64_t*, uint64_t, int*) nogil + void kh_del_uint64(kh_uint64_t*, khint_t) nogil + + bint kh_exist_uint64(kh_uint64_t*, khiter_t) nogil + + ctypedef struct kh_float64_t: + khint_t n_buckets, size, n_occupied, upper_bound + uint32_t *flags + float64_t *keys + size_t *vals + + kh_float64_t* kh_init_float64() nogil + void kh_destroy_float64(kh_float64_t*) nogil + void kh_clear_float64(kh_float64_t*) nogil + khint_t kh_get_float64(kh_float64_t*, float64_t) nogil + void kh_resize_float64(kh_float64_t*, khint_t) nogil + khint_t kh_put_float64(kh_float64_t*, float64_t, int*) nogil + void kh_del_float64(kh_float64_t*, khint_t) nogil + + bint kh_exist_float64(kh_float64_t*, khiter_t) nogil + + ctypedef struct kh_int32_t: + khint_t n_buckets, size, n_occupied, upper_bound + uint32_t *flags + int32_t *keys + size_t *vals + + kh_int32_t* kh_init_int32() nogil + void kh_destroy_int32(kh_int32_t*) nogil + void kh_clear_int32(kh_int32_t*) nogil + khint_t kh_get_int32(kh_int32_t*, int32_t) nogil + void kh_resize_int32(kh_int32_t*, khint_t) nogil + khint_t kh_put_int32(kh_int32_t*, int32_t, int*) nogil + void kh_del_int32(kh_int32_t*, khint_t) nogil + + bint kh_exist_int32(kh_int32_t*, khiter_t) nogil + # sweep factorize ctypedef struct kh_strbox_t: @@ -97,5 +150,3 @@ cdef extern from "khash_python.h": void kh_del_strbox(kh_strbox_t*, khint_t) nogil bint kh_exist_strbox(kh_strbox_t*, khiter_t) nogil - -include "khash_for_primitive_helper.pxi" diff --git a/pandas/_libs/khash_for_primitive_helper.pxi.in b/pandas/_libs/khash_for_primitive_helper.pxi.in deleted file mode 100644 index db8d3e0b19417..0000000000000 --- a/pandas/_libs/khash_for_primitive_helper.pxi.in +++ /dev/null @@ -1,42 +0,0 @@ -""" -Template for wrapping khash-tables for each primitive `dtype` - -WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in -""" - -{{py: - -# name, c_type -primitive_types = 
[('int64', 'int64_t'), - ('uint64', 'uint64_t'), - ('float64', 'float64_t'), - ('int32', 'int32_t'), - ('uint32', 'uint32_t'), - ('float32', 'float32_t'), - ('int16', 'int16_t'), - ('uint16', 'uint16_t'), - ('int8', 'int8_t'), - ('uint8', 'uint8_t'), - ] -}} - -{{for name, c_type in primitive_types}} - -cdef extern from "khash_python.h": - ctypedef struct kh_{{name}}_t: - khint_t n_buckets, size, n_occupied, upper_bound - uint32_t *flags - {{c_type}} *keys - size_t *vals - - kh_{{name}}_t* kh_init_{{name}}() nogil - void kh_destroy_{{name}}(kh_{{name}}_t*) nogil - void kh_clear_{{name}}(kh_{{name}}_t*) nogil - khint_t kh_get_{{name}}(kh_{{name}}_t*, {{c_type}}) nogil - void kh_resize_{{name}}(kh_{{name}}_t*, khint_t) nogil - khint_t kh_put_{{name}}(kh_{{name}}_t*, {{c_type}}, int*) nogil - void kh_del_{{name}}(kh_{{name}}_t*, khint_t) nogil - - bint kh_exist_{{name}}(kh_{{name}}_t*, khiter_t) nogil - -{{endfor}} diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 1ca18bae4e2c4..0b0334d52c1e9 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -118,8 +118,6 @@ def memory_usage_of_objects(arr: object[:]) -> int64_t: def is_scalar(val: object) -> bool: """ - Return True if given object is scalar. - Parameters ---------- val : object @@ -636,7 +634,7 @@ cpdef ndarray[object] ensure_string_array( ---------- arr : array-like The values to be converted to str, if needed. - na_value : Any, default np.nan + na_value : Any The value to use for na. For example, np.nan or pd.NA. convert_na_value : bool, default True If False, existing na values will be used unchanged in the new array. @@ -929,8 +927,6 @@ def indices_fast(ndarray index, const int64_t[:] labels, list keys, def is_float(obj: object) -> bool: """ - Return True if given object is float. - Returns ------- bool @@ -940,8 +936,6 @@ def is_float(obj: object) -> bool: def is_integer(obj: object) -> bool: """ - Return True if given object is integer. - Returns ------- bool @@ -951,8 +945,6 @@ def is_integer(obj: object) -> bool: def is_bool(obj: object) -> bool: """ - Return True if given object is boolean. - Returns ------- bool @@ -962,8 +954,6 @@ def is_bool(obj: object) -> bool: def is_complex(obj: object) -> bool: """ - Return True if given object is complex. - Returns ------- bool @@ -981,7 +971,7 @@ cpdef bint is_interval(object obj): def is_period(val: object) -> bool: """ - Return True if given object is Period. + Return a boolean if this is a Period object. Returns ------- diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index ad6329c588bbe..9459cd297c758 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -44,9 +44,7 @@ cdef class _BaseGrouper: Slider islider, Slider vslider): if cached_typ is None: cached_ityp = self.ityp(islider.buf) - cached_typ = self.typ( - vslider.buf, dtype=vslider.buf.dtype, index=cached_ityp, name=self.name - ) + cached_typ = self.typ(vslider.buf, index=cached_ityp, name=self.name) else: # See the comment in indexes/base.py about _index_data. 
# We need this for EA-backed indexes that have a reference diff --git a/pandas/_libs/src/klib/khash.h b/pandas/_libs/src/klib/khash.h index bb56b2fe2d145..916838d1e9584 100644 --- a/pandas/_libs/src/klib/khash.h +++ b/pandas/_libs/src/klib/khash.h @@ -115,24 +115,6 @@ int main() { #include "../inline_helper.h" -// hooks for memory allocator, C-runtime allocator used per default -#ifndef KHASH_MALLOC -#define KHASH_MALLOC malloc -#endif - -#ifndef KHASH_REALLOC -#define KHASH_REALLOC realloc -#endif - -#ifndef KHASH_CALLOC -#define KHASH_CALLOC calloc -#endif - -#ifndef KHASH_FREE -#define KHASH_FREE free -#endif - - #if UINT_MAX == 0xffffffffu typedef unsigned int khint32_t; #elif ULONG_MAX == 0xffffffffu @@ -140,23 +122,14 @@ typedef unsigned long khint32_t; #endif #if ULONG_MAX == ULLONG_MAX -typedef unsigned long khint64_t; +typedef unsigned long khuint64_t; +typedef signed long khint64_t; #else -typedef unsigned long long khint64_t; -#endif - -#if UINT_MAX == 0xffffu -typedef unsigned int khint16_t; -#elif USHRT_MAX == 0xffffu -typedef unsigned short khint16_t; -#endif - -#if UCHAR_MAX == 0xffu -typedef unsigned char khint8_t; +typedef unsigned long long khuint64_t; +typedef signed long long khint64_t; #endif typedef double khfloat64_t; -typedef float khfloat32_t; typedef khint32_t khint_t; typedef khint_t khiter_t; @@ -170,86 +143,10 @@ typedef khint_t khiter_t; #define __ac_set_isboth_false(flag, i) __ac_set_isempty_false(flag, i) #define __ac_set_isdel_true(flag, i) ((void)0) - -// specializations of https://github.com/aappleby/smhasher/blob/master/src/MurmurHash2.cpp -khint32_t PANDAS_INLINE murmur2_32to32(khint32_t k){ - const khint32_t SEED = 0xc70f6907UL; - // 'm' and 'r' are mixing constants generated offline. - // They're not really 'magic', they just happen to work well. - const khint32_t M_32 = 0x5bd1e995; - const int R_32 = 24; - - // Initialize the hash to a 'random' value - khint32_t h = SEED ^ 4; - - //handle 4 bytes: - k *= M_32; - k ^= k >> R_32; - k *= M_32; - - h *= M_32; - h ^= k; - - // Do a few final mixes of the hash to ensure the "last few - // bytes" are well-incorporated. (Really needed here?) - h ^= h >> 13; - h *= M_32; - h ^= h >> 15; - return h; -} - -// it is possible to have a special x64-version, which would need less operations, but -// using 32bit version always has also some benifits: -// - one code for 32bit and 64bit builds -// - the same case for 32bit and 64bit builds -// - no performance difference could be measured compared to a possible x64-version - -khint32_t PANDAS_INLINE murmur2_32_32to32(khint32_t k1, khint32_t k2){ - const khint32_t SEED = 0xc70f6907UL; - // 'm' and 'r' are mixing constants generated offline. - // They're not really 'magic', they just happen to work well. - const khint32_t M_32 = 0x5bd1e995; - const int R_32 = 24; - - // Initialize the hash to a 'random' value - khint32_t h = SEED ^ 4; - - //handle first 4 bytes: - k1 *= M_32; - k1 ^= k1 >> R_32; - k1 *= M_32; - - h *= M_32; - h ^= k1; - - //handle second 4 bytes: - k2 *= M_32; - k2 ^= k2 >> R_32; - k2 *= M_32; - - h *= M_32; - h ^= k2; - - // Do a few final mixes of the hash to ensure the "last few - // bytes" are well-incorporated. 
- h ^= h >> 13; - h *= M_32; - h ^= h >> 15; - return h; -} - -khint32_t PANDAS_INLINE murmur2_64to32(khint64_t k){ - khint32_t k1 = (khint32_t)k; - khint32_t k2 = (khint32_t)(k >> 32); - - return murmur2_32_32to32(k1, k2); -} - - #ifdef KHASH_LINEAR #define __ac_inc(k, m) 1 #else -#define __ac_inc(k, m) (murmur2_32to32(k) | 1) & (m) +#define __ac_inc(k, m) (((k)>>3 ^ (k)<<3) | 1) & (m) #endif #define __ac_fsize(m) ((m) < 32? 1 : (m)>>5) @@ -283,14 +180,14 @@ static const double __ac_HASH_UPPER = 0.77; khval_t *vals; \ } kh_##name##_t; \ SCOPE kh_##name##_t *kh_init_##name(void) { \ - return (kh_##name##_t*)KHASH_CALLOC(1, sizeof(kh_##name##_t)); \ + return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t)); \ } \ SCOPE void kh_destroy_##name(kh_##name##_t *h) \ { \ if (h) { \ - KHASH_FREE(h->keys); KHASH_FREE(h->flags); \ - KHASH_FREE(h->vals); \ - KHASH_FREE(h); \ + free(h->keys); free(h->flags); \ + free(h->vals); \ + free(h); \ } \ } \ SCOPE void kh_clear_##name(kh_##name##_t *h) \ @@ -323,11 +220,11 @@ static const double __ac_HASH_UPPER = 0.77; if (new_n_buckets < 4) new_n_buckets = 4; \ if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \ else { /* hash table size to be changed (shrink or expand); rehash */ \ - new_flags = (khint32_t*)KHASH_MALLOC(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ + new_flags = (khint32_t*)malloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ memset(new_flags, 0xff, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ if (h->n_buckets < new_n_buckets) { /* expand */ \ - h->keys = (khkey_t*)KHASH_REALLOC(h->keys, new_n_buckets * sizeof(khkey_t)); \ - if (kh_is_map) h->vals = (khval_t*)KHASH_REALLOC(h->vals, new_n_buckets * sizeof(khval_t)); \ + h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ } /* otherwise shrink */ \ } \ } \ @@ -360,10 +257,10 @@ static const double __ac_HASH_UPPER = 0.77; } \ } \ if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \ - h->keys = (khkey_t*)KHASH_REALLOC(h->keys, new_n_buckets * sizeof(khkey_t)); \ - if (kh_is_map) h->vals = (khval_t*)KHASH_REALLOC(h->vals, new_n_buckets * sizeof(khval_t)); \ + h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ } \ - KHASH_FREE(h->flags); /* free the working space */ \ + free(h->flags); /* free the working space */ \ h->flags = new_flags; \ h->n_buckets = new_n_buckets; \ h->n_occupied = h->size; \ @@ -615,25 +512,15 @@ PANDAS_INLINE khint_t __ac_Wang_hash(khint_t key) @param name Name of the hash table [symbol] @param khval_t Type of values [type] */ - -// we implicitly convert signed int to unsigned int, thus potential overflows -// for operations (<<,*,+) don't trigger undefined behavior, also >>-operator -// is implementation defined for signed ints if sign-bit is set. -// because we never really "get" the keys, there will be no convertion from -// unsigend int to (signed) int (which would be implementation defined behavior) -// this holds also for 64-, 16- and 8-bit integers #define KHASH_MAP_INIT_INT(name, khval_t) \ KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) -#define KHASH_MAP_INIT_UINT(name, khval_t) \ - KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) - /*! 
@function @abstract Instantiate a hash map containing 64-bit integer keys @param name Name of the hash table [symbol] */ #define KHASH_SET_INIT_UINT64(name) \ - KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) + KHASH_INIT(name, khuint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) #define KHASH_SET_INIT_INT64(name) \ KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) @@ -644,34 +531,11 @@ PANDAS_INLINE khint_t __ac_Wang_hash(khint_t key) @param khval_t Type of values [type] */ #define KHASH_MAP_INIT_UINT64(name, khval_t) \ - KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) + KHASH_INIT(name, khuint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) #define KHASH_MAP_INIT_INT64(name, khval_t) \ KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) -/*! @function - @abstract Instantiate a hash map containing 16bit-integer keys - @param name Name of the hash table [symbol] - @param khval_t Type of values [type] - */ -#define KHASH_MAP_INIT_INT16(name, khval_t) \ - KHASH_INIT(name, khint16_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) - -#define KHASH_MAP_INIT_UINT16(name, khval_t) \ - KHASH_INIT(name, khint16_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) - -/*! @function - @abstract Instantiate a hash map containing 8bit-integer keys - @param name Name of the hash table [symbol] - @param khval_t Type of values [type] - */ -#define KHASH_MAP_INIT_INT8(name, khval_t) \ - KHASH_INIT(name, khint8_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) - -#define KHASH_MAP_INIT_UINT8(name, khval_t) \ - KHASH_INIT(name, khint8_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) - - typedef const char *kh_cstr_t; /*! @function @@ -694,23 +558,12 @@ typedef const char *kh_cstr_t; #define kh_exist_float64(h, k) (kh_exist(h, k)) #define kh_exist_uint64(h, k) (kh_exist(h, k)) #define kh_exist_int64(h, k) (kh_exist(h, k)) -#define kh_exist_float32(h, k) (kh_exist(h, k)) #define kh_exist_int32(h, k) (kh_exist(h, k)) -#define kh_exist_uint32(h, k) (kh_exist(h, k)) -#define kh_exist_int16(h, k) (kh_exist(h, k)) -#define kh_exist_uint16(h, k) (kh_exist(h, k)) -#define kh_exist_int8(h, k) (kh_exist(h, k)) -#define kh_exist_uint8(h, k) (kh_exist(h, k)) KHASH_MAP_INIT_STR(str, size_t) KHASH_MAP_INIT_INT(int32, size_t) -KHASH_MAP_INIT_UINT(uint32, size_t) KHASH_MAP_INIT_INT64(int64, size_t) KHASH_MAP_INIT_UINT64(uint64, size_t) -KHASH_MAP_INIT_INT16(int16, size_t) -KHASH_MAP_INIT_UINT16(uint16, size_t) -KHASH_MAP_INIT_INT8(int8, size_t) -KHASH_MAP_INIT_UINT8(uint8, size_t) #endif /* __AC_KHASH_H */ diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h index 8e4e61b4f3077..2b46d30c3adb6 100644 --- a/pandas/_libs/src/klib/khash_python.h +++ b/pandas/_libs/src/klib/khash_python.h @@ -1,59 +1,6 @@ #include #include -// khash should report usage to tracemalloc -#if PY_VERSION_HEX >= 0x03060000 -#include -#if PY_VERSION_HEX < 0x03070000 -#define PyTraceMalloc_Track _PyTraceMalloc_Track -#define PyTraceMalloc_Untrack _PyTraceMalloc_Untrack -#endif -#else -#define PyTraceMalloc_Track(...) -#define PyTraceMalloc_Untrack(...) 
-#endif - - -static const int KHASH_TRACE_DOMAIN = 424242; -void *traced_malloc(size_t size){ - void * ptr = malloc(size); - if(ptr!=NULL){ - PyTraceMalloc_Track(KHASH_TRACE_DOMAIN, (uintptr_t)ptr, size); - } - return ptr; -} - -void *traced_calloc(size_t num, size_t size){ - void * ptr = calloc(num, size); - if(ptr!=NULL){ - PyTraceMalloc_Track(KHASH_TRACE_DOMAIN, (uintptr_t)ptr, num*size); - } - return ptr; -} - -void *traced_realloc(void* old_ptr, size_t size){ - void * ptr = realloc(old_ptr, size); - if(ptr!=NULL){ - if(old_ptr != ptr){ - PyTraceMalloc_Untrack(KHASH_TRACE_DOMAIN, (uintptr_t)old_ptr); - } - PyTraceMalloc_Track(KHASH_TRACE_DOMAIN, (uintptr_t)ptr, size); - } - return ptr; -} - -void traced_free(void* ptr){ - if(ptr!=NULL){ - PyTraceMalloc_Untrack(KHASH_TRACE_DOMAIN, (uintptr_t)ptr); - } - free(ptr); -} - - -#define KHASH_MALLOC traced_malloc -#define KHASH_REALLOC traced_realloc -#define KHASH_CALLOC traced_calloc -#define KHASH_FREE traced_free #include "khash.h" // Previously we were using the built in cpython hash function for doubles @@ -66,68 +13,33 @@ void traced_free(void* ptr){ // is 64 bits the truncation causes collission issues. Given all that, we use our own // simple hash, viewing the double bytes as an int64 and using khash's default // hash for 64 bit integers. -// GH 13436 showed that _Py_HashDouble doesn't work well with khash -// GH 28303 showed, that the simple xoring-version isn't good enough -// See GH 36729 for evaluation of the currently used murmur2-hash version -// An interesting alternative to expensive murmur2-hash would be to change -// the probing strategy and use e.g. the probing strategy from CPython's -// implementation of dicts, which shines for smaller sizes but is more -// predisposed to superlinear running times (see GH 36729 for comparison) - - +// GH 13436 khint64_t PANDAS_INLINE asint64(double key) { - khint64_t val; - memcpy(&val, &key, sizeof(double)); - return val; -} - -khint32_t PANDAS_INLINE asint32(float key) { - khint32_t val; - memcpy(&val, &key, sizeof(float)); - return val; + khint64_t val; + memcpy(&val, &key, sizeof(double)); + return val; } -#define ZERO_HASH 0 -#define NAN_HASH 0 +// correct for all inputs but not -0.0 and NaNs +#define kh_float64_hash_func_0_NAN(key) (khint32_t)((asint64(key))>>33^(asint64(key))^(asint64(key))<<11) -khint32_t PANDAS_INLINE kh_float64_hash_func(double val){ - // 0.0 and -0.0 should have the same hash: - if (val == 0.0){ - return ZERO_HASH; - } - // all nans should have the same hash: - if ( val!=val ){ - return NAN_HASH; - } - khint64_t as_int = asint64(val); - return murmur2_64to32(as_int); -} +// correct for all inputs but not NaNs +#define kh_float64_hash_func_NAN(key) ((key) == 0.0 ? \ + kh_float64_hash_func_0_NAN(0.0) : \ + kh_float64_hash_func_0_NAN(key)) -khint32_t PANDAS_INLINE kh_float32_hash_func(float val){ - // 0.0 and -0.0 should have the same hash: - if (val == 0.0f){ - return ZERO_HASH; - } - // all nans should have the same hash: - if ( val!=val ){ - return NAN_HASH; - } - khint32_t as_int = asint32(val); - return murmur2_32to32(as_int); -} +// correct for all +#define kh_float64_hash_func(key) ((key) != (key) ? 
\ + kh_float64_hash_func_NAN(Py_NAN) : \ + kh_float64_hash_func_NAN(key)) -#define kh_floats_hash_equal(a, b) ((a) == (b) || ((b) != (b) && (a) != (a))) +#define kh_float64_hash_equal(a, b) ((a) == (b) || ((b) != (b) && (a) != (a))) #define KHASH_MAP_INIT_FLOAT64(name, khval_t) \ - KHASH_INIT(name, khfloat64_t, khval_t, 1, kh_float64_hash_func, kh_floats_hash_equal) + KHASH_INIT(name, khfloat64_t, khval_t, 1, kh_float64_hash_func, kh_float64_hash_equal) KHASH_MAP_INIT_FLOAT64(float64, size_t) -#define KHASH_MAP_INIT_FLOAT32(name, khval_t) \ - KHASH_INIT(name, khfloat32_t, khval_t, 1, kh_float32_hash_func, kh_floats_hash_equal) - -KHASH_MAP_INIT_FLOAT32(float32, size_t) - int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) { int result = PyObject_RichCompareBool(a, b, Py_EQ); @@ -181,7 +93,7 @@ typedef struct { typedef kh_str_starts_t* p_kh_str_starts_t; p_kh_str_starts_t PANDAS_INLINE kh_init_str_starts(void) { - kh_str_starts_t *result = (kh_str_starts_t*)KHASH_CALLOC(1, sizeof(kh_str_starts_t)); + kh_str_starts_t *result = (kh_str_starts_t*)calloc(1, sizeof(kh_str_starts_t)); result->table = kh_init_str(); return result; } @@ -204,7 +116,7 @@ khint_t PANDAS_INLINE kh_get_str_starts_item(const kh_str_starts_t* table, const void PANDAS_INLINE kh_destroy_str_starts(kh_str_starts_t* table) { kh_destroy_str(table->table); - KHASH_FREE(table); + free(table); } void PANDAS_INLINE kh_resize_str_starts(kh_str_starts_t* table, khint_t val) { diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 88144330c1fe9..df8ec68986ccb 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -159,7 +159,7 @@ int parser_init(parser_t *self) { self->warn_msg = NULL; // token stream - self->stream = malloc(STREAM_INIT_SIZE * sizeof(char)); + self->stream = (char *)malloc(STREAM_INIT_SIZE * sizeof(char)); if (self->stream == NULL) { parser_cleanup(self); return PARSER_OUT_OF_MEMORY; @@ -170,16 +170,16 @@ int parser_init(parser_t *self) { // word pointers and metadata sz = STREAM_INIT_SIZE / 10; sz = sz ? 
sz : 1; - self->words = malloc(sz * sizeof(char *)); - self->word_starts = malloc(sz * sizeof(int64_t)); + self->words = (char **)malloc(sz * sizeof(char *)); + self->word_starts = (int64_t *)malloc(sz * sizeof(int64_t)); self->max_words_cap = sz; self->words_cap = sz; self->words_len = 0; // line pointers and metadata - self->line_start = malloc(sz * sizeof(int64_t)); + self->line_start = (int64_t *)malloc(sz * sizeof(int64_t)); - self->line_fields = malloc(sz * sizeof(int64_t)); + self->line_fields = (int64_t *)malloc(sz * sizeof(int64_t)); self->lines_cap = sz; self->lines = 0; @@ -345,7 +345,7 @@ static int push_char(parser_t *self, char c) { "self->stream_cap(%d)\n", self->stream_len, self->stream_cap)) int64_t bufsize = 100; - self->error_msg = malloc(bufsize); + self->error_msg = (char *)malloc(bufsize); snprintf(self->error_msg, bufsize, "Buffer overflow caught - possible malformed input file.\n"); return PARSER_OUT_OF_MEMORY; @@ -362,7 +362,7 @@ int PANDAS_INLINE end_field(parser_t *self) { "self->words_cap(%zu)\n", self->words_len, self->words_cap)) int64_t bufsize = 100; - self->error_msg = malloc(bufsize); + self->error_msg = (char *)malloc(bufsize); snprintf(self->error_msg, bufsize, "Buffer overflow caught - possible malformed input file.\n"); return PARSER_OUT_OF_MEMORY; @@ -398,7 +398,7 @@ static void append_warning(parser_t *self, const char *msg) { void *newptr; if (self->warn_msg == NULL) { - self->warn_msg = malloc(length + 1); + self->warn_msg = (char *)malloc(length + 1); snprintf(self->warn_msg, length + 1, "%s", msg); } else { ex_length = strlen(self->warn_msg); @@ -459,10 +459,10 @@ static int end_line(parser_t *self) { // file_lines is now the actual file line number (starting at 1) if (self->error_bad_lines) { - self->error_msg = malloc(bufsize); + self->error_msg = (char *)malloc(bufsize); snprintf(self->error_msg, bufsize, - "Expected %d fields in line %" PRIu64 ", saw %" PRId64 "\n", - ex_fields, self->file_lines, fields); + "Expected %d fields in line %lld, saw %lld\n", + ex_fields, (long long)self->file_lines, (long long)fields); TRACE(("Error at line %d, %d fields\n", self->file_lines, fields)); @@ -471,10 +471,11 @@ static int end_line(parser_t *self) { // simply skip bad lines if (self->warn_bad_lines) { // pass up error message - msg = malloc(bufsize); + msg = (char *)malloc(bufsize); snprintf(msg, bufsize, - "Skipping line %" PRIu64 ": expected %d fields, saw %" - PRId64 "\n", self->file_lines, ex_fields, fields); + "Skipping line %lld: expected %d fields, saw %lld\n", + (long long)self->file_lines, ex_fields, + (long long)fields); append_warning(self, msg); free(msg); } @@ -486,7 +487,7 @@ static int end_line(parser_t *self) { // might overrun the buffer when closing fields if (make_stream_space(self, ex_fields - fields) < 0) { int64_t bufsize = 100; - self->error_msg = malloc(bufsize); + self->error_msg = (char *)malloc(bufsize); snprintf(self->error_msg, bufsize, "out of memory"); return -1; } @@ -507,7 +508,7 @@ static int end_line(parser_t *self) { "end_line: ERROR!!! 
self->lines(%zu) >= self->lines_cap(%zu)\n", self->lines, self->lines_cap)) int64_t bufsize = 100; - self->error_msg = malloc(bufsize); + self->error_msg = (char *)malloc(bufsize); snprintf(self->error_msg, bufsize, "Buffer overflow caught - " "possible malformed input file.\n"); @@ -568,7 +569,7 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) { if (status != REACHED_EOF && self->data == NULL) { int64_t bufsize = 200; - self->error_msg = malloc(bufsize); + self->error_msg = (char *)malloc(bufsize); if (status == CALLING_READ_FAILED) { snprintf(self->error_msg, bufsize, @@ -599,7 +600,7 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) { TRACE(("PUSH_CHAR: ERROR!!! slen(%d) >= stream_cap(%d)\n", slen, \ self->stream_cap)) \ int64_t bufsize = 100; \ - self->error_msg = malloc(bufsize); \ + self->error_msg = (char *)malloc(bufsize); \ snprintf(self->error_msg, bufsize, \ "Buffer overflow caught - possible malformed input file.\n");\ return PARSER_OUT_OF_MEMORY; \ @@ -729,7 +730,7 @@ int tokenize_bytes(parser_t *self, if (make_stream_space(self, self->datalen - self->datapos) < 0) { int64_t bufsize = 100; - self->error_msg = malloc(bufsize); + self->error_msg = (char *)malloc(bufsize); snprintf(self->error_msg, bufsize, "out of memory"); return -1; } @@ -1036,7 +1037,7 @@ int tokenize_bytes(parser_t *self, self->state = IN_FIELD; } else { int64_t bufsize = 100; - self->error_msg = malloc(bufsize); + self->error_msg = (char *)malloc(bufsize); snprintf(self->error_msg, bufsize, "delimiter expected after quote in quote"); goto parsingerror; @@ -1149,8 +1150,8 @@ static int parser_handle_eof(parser_t *self) { case IN_QUOTED_FIELD: self->error_msg = (char *)malloc(bufsize); snprintf(self->error_msg, bufsize, - "EOF inside string starting at row %" PRIu64, - self->file_lines); + "EOF inside string starting at row %lld", + (long long)self->file_lines); return -1; case ESCAPED_CHAR: @@ -1202,7 +1203,7 @@ int parser_consume_rows(parser_t *self, size_t nrows) { /* move stream, only if something to move */ if (char_count < self->stream_len) { - memmove(self->stream, (self->stream + char_count), + memmove((void *)self->stream, (void *)(self->stream + char_count), self->stream_len - char_count); } /* buffer counts */ @@ -1268,16 +1269,20 @@ int parser_trim_buffers(parser_t *self) { new_cap = _next_pow2(self->words_len) + 1; if (new_cap < self->words_cap) { TRACE(("parser_trim_buffers: new_cap < self->words_cap\n")); - self->words = realloc(self->words, new_cap * sizeof(char *)); - if (self->words == NULL) { + newptr = realloc((void *)self->words, new_cap * sizeof(char *)); + if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; + } else { + self->words = (char **)newptr; } - self->word_starts = realloc(self->word_starts, - new_cap * sizeof(int64_t)); - if (self->word_starts == NULL) { + newptr = realloc((void *)self->word_starts, + new_cap * sizeof(int64_t)); + if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; + } else { + self->word_starts = (int64_t *)newptr; + self->words_cap = new_cap; } - self->words_cap = new_cap; } /* trim stream */ @@ -1290,7 +1295,7 @@ int parser_trim_buffers(parser_t *self) { TRACE( ("parser_trim_buffers: new_cap < self->stream_cap, calling " "realloc\n")); - newptr = realloc(self->stream, new_cap); + newptr = realloc((void *)self->stream, new_cap); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { @@ -1316,19 +1321,19 @@ int parser_trim_buffers(parser_t *self) { new_cap = _next_pow2(self->lines) + 1; if (new_cap < self->lines_cap) { 
TRACE(("parser_trim_buffers: new_cap < self->lines_cap\n")); - newptr = realloc(self->line_start, + newptr = realloc((void *)self->line_start, new_cap * sizeof(int64_t)); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { - self->line_start = newptr; + self->line_start = (int64_t *)newptr; } - newptr = realloc(self->line_fields, + newptr = realloc((void *)self->line_fields, new_cap * sizeof(int64_t)); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { - self->line_fields = newptr; + self->line_fields = (int64_t *)newptr; self->lines_cap = new_cap; } } @@ -1823,14 +1828,14 @@ double round_trip(const char *p, char **q, char decimal, char sci, char tsep, if (endpc == pc + strlen(pc)) { if (q != NULL) { // report endptr from source string (p) - *q = endptr; + *q = (char *) endptr; } } else { *error = -1; if (q != NULL) { // p and pc are different len due to tsep removal. Can't report // how much it has consumed of p. Just rewind to beginning. - *q = (char *)p; // TODO(willayd): this could be undefined behavior + *q = (char *)p; } } if (maybe_int != NULL) *maybe_int = 0; @@ -1858,7 +1863,7 @@ int uint64_conflict(uint_state *self) { int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, int *error, char tsep) { - const char *p = p_item; + const char *p = (const char *)p_item; int isneg = 0; int64_t number = 0; int d; @@ -1978,7 +1983,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, uint64_t uint_max, int *error, char tsep) { - const char *p = p_item; + const char *p = (const char *)p_item; uint64_t pre_max = uint_max / 10; int dig_pre_max = uint_max % 10; uint64_t number = 0; diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 1339dee954603..dbd094905cf24 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -1403,19 +1403,6 @@ cdef class BusinessDay(BusinessMixin): cdef class BusinessHour(BusinessMixin): """ DateOffset subclass representing possibly n business hours. - - Parameters - ---------- - n : int, default 1 - The number of months represented. - normalize : bool, default False - Normalize start/end dates to midnight before generating date range. - weekmask : str, Default 'Mon Tue Wed Thu Fri' - Weekmask of valid business days, passed to ``numpy.busdaycalendar``. - start : str, default "09:00" - Start time of your custom business hour in 24h format. - end : str, default: "17:00" - End time of your custom business hour in 24h format. """ _prefix = "BH" @@ -3264,19 +3251,6 @@ cdef class CustomBusinessDay(BusinessDay): cdef class CustomBusinessHour(BusinessHour): """ DateOffset subclass representing possibly n custom business days. - - Parameters - ---------- - n : int, default 1 - The number of months represented. - normalize : bool, default False - Normalize start/end dates to midnight before generating date range. - weekmask : str, Default 'Mon Tue Wed Thu Fri' - Weekmask of valid business days, passed to ``numpy.busdaycalendar``. - start : str, default "09:00" - Start time of your custom business hour in 24h format. - end : str, default: "17:00" - End time of your custom business hour in 24h format. 
""" _prefix = "CBH" diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index e4b19d844dcab..29e8c58055f9e 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -405,11 +405,9 @@ cdef inline int64_t parse_timedelta_string(str ts) except? -1: m = 10**(3 -len(frac)) * 1000 * 1000 elif len(frac) > 3 and len(frac) <= 6: m = 10**(6 -len(frac)) * 1000 - elif len(frac) > 6 and len(frac) <= 9: - m = 10**(9 -len(frac)) else: - m = 1 - frac = frac[:9] + m = 10**(9 -len(frac)) + r = int(''.join(frac)) * m result += timedelta_as_neg(r, neg) @@ -1145,9 +1143,6 @@ class Timedelta(_Timedelta): Notes ----- The ``.value`` attribute is always in ns. - - If the precision is higher than nanoseconds, the precision of the duration is - truncated to nanoseconds. """ def __new__(cls, object value=_no_input, unit=None, **kwargs): diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index 1049682af08e8..f08a86b1262e6 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -426,7 +426,7 @@ def tz_convert_from_utc(const int64_t[:] vals, tzinfo tz): int64 ndarray of converted """ cdef: - const int64_t[:] converted + int64_t[:] converted if len(vals) == 0: return np.array([], dtype=np.int64) @@ -437,7 +437,7 @@ def tz_convert_from_utc(const int64_t[:] vals, tzinfo tz): @cython.boundscheck(False) @cython.wraparound(False) -cdef const int64_t[:] _tz_convert_from_utc(const int64_t[:] vals, tzinfo tz): +cdef int64_t[:] _tz_convert_from_utc(const int64_t[:] vals, tzinfo tz): """ Convert the given values (in i8) either to UTC or from UTC. @@ -459,7 +459,7 @@ cdef const int64_t[:] _tz_convert_from_utc(const int64_t[:] vals, tzinfo tz): str typ if is_utc(tz): - return vals + converted = vals.copy() elif is_tzlocal(tz): converted = np.empty(n, dtype=np.int64) for i in range(n): diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 54a09a6d2ede7..4de7a5860c465 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -136,7 +136,7 @@ cdef inline void remove_sum(float64_t val, int64_t *nobs, float64_t *sum_x, sum_x[0] = t -def roll_sum(const float64_t[:] values, ndarray[int64_t] start, +def roll_sum(ndarray[float64_t] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp): cdef: float64_t sum_x = 0, compensation_add = 0, compensation_remove = 0 @@ -240,7 +240,7 @@ cdef inline void remove_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x, neg_ct[0] = neg_ct[0] - 1 -def roll_mean(const float64_t[:] values, ndarray[int64_t] start, +def roll_mean(ndarray[float64_t] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp): cdef: float64_t val, compensation_add = 0, compensation_remove = 0, sum_x = 0 @@ -361,7 +361,7 @@ cdef inline void remove_var(float64_t val, float64_t *nobs, float64_t *mean_x, ssqdm_x[0] = 0 -def roll_var(const float64_t[:] values, ndarray[int64_t] start, +def roll_var(ndarray[float64_t] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp, int ddof=1): """ Numerically stable implementation using Welford's method. @@ -772,7 +772,7 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, # Rolling median, min, max -def roll_median_c(const float64_t[:] values, ndarray[int64_t] start, +def roll_median_c(ndarray[float64_t] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp): # GH 32865. 
win argument kept for compatibility cdef: @@ -1032,7 +1032,7 @@ interpolation_types = { } -def roll_quantile(const float64_t[:] values, ndarray[int64_t] start, +def roll_quantile(ndarray[float64_t, cast=True] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp, float64_t quantile, str interpolation): """ @@ -1496,8 +1496,8 @@ def roll_weighted_var(float64_t[:] values, float64_t[:] weights, # ---------------------------------------------------------------------- # Exponentially weighted moving average -def ewma_time(const float64_t[:] vals, int64_t[:] start, int64_t[:] end, - int minp, ndarray[int64_t] times, int64_t halflife): +def ewma_time(const float64_t[:] vals, int minp, ndarray[int64_t] times, + int64_t halflife): """ Compute exponentially-weighted moving average using halflife and time distances. @@ -1505,8 +1505,6 @@ def ewma_time(const float64_t[:] vals, int64_t[:] start, int64_t[:] end, Parameters ---------- vals : ndarray[float_64] - start: ndarray[int_64] - end: ndarray[int_64] minp : int times : ndarray[int64] halflife : int64 @@ -1554,20 +1552,17 @@ def ewma_time(const float64_t[:] vals, int64_t[:] start, int64_t[:] end, return output -def ewma(float64_t[:] vals, int64_t[:] start, int64_t[:] end, int minp, - float64_t com, bint adjust, bint ignore_na): +def ewma(float64_t[:] vals, float64_t com, bint adjust, bint ignore_na, int minp): """ Compute exponentially-weighted moving average using center-of-mass. Parameters ---------- vals : ndarray (float64 type) - start: ndarray (int64 type) - end: ndarray (int64 type) - minp : int com : float64 adjust : int ignore_na : bool + minp : int Returns ------- @@ -1625,21 +1620,19 @@ def ewma(float64_t[:] vals, int64_t[:] start, int64_t[:] end, int minp, # Exponentially weighted moving covariance -def ewmcov(float64_t[:] input_x, int64_t[:] start, int64_t[:] end, int minp, - float64_t[:] input_y, float64_t com, bint adjust, bint ignore_na, bint bias): +def ewmcov(float64_t[:] input_x, float64_t[:] input_y, + float64_t com, bint adjust, bint ignore_na, int minp, bint bias): """ Compute exponentially-weighted moving variance using center-of-mass. Parameters ---------- input_x : ndarray (float64 type) - start: ndarray (int64 type) - end: ndarray (int64 type) - minp : int input_y : ndarray (float64 type) com : float64 adjust : int ignore_na : bool + minp : int bias : int Returns diff --git a/pandas/_testing.py b/pandas/_testing.py index 68371b782aac2..5dcd1247e52ba 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -739,29 +739,22 @@ def assert_index_equal( obj : str, default 'Index' Specify object name being compared, internally used to show appropriate assertion message. 
- - Examples - -------- - >>> from pandas.testing import assert_index_equal - >>> a = pd.Index([1, 2, 3]) - >>> b = pd.Index([1, 2, 3]) - >>> assert_index_equal(a, b) """ __tracebackhide__ = True - def _check_types(left, right, obj="Index"): + def _check_types(l, r, obj="Index"): if exact: - assert_class_equal(left, right, exact=exact, obj=obj) + assert_class_equal(l, r, exact=exact, obj=obj) # Skip exact dtype checking when `check_categorical` is False if check_categorical: - assert_attr_equal("dtype", left, right, obj=obj) + assert_attr_equal("dtype", l, r, obj=obj) # allow string-like to have different inferred_types - if left.inferred_type in ("string"): - assert right.inferred_type in ("string") + if l.inferred_type in ("string"): + assert r.inferred_type in ("string") else: - assert_attr_equal("inferred_type", left, right, obj=obj) + assert_attr_equal("inferred_type", l, r, obj=obj) def _get_ilevel_values(index, level): # accept level number only @@ -1147,9 +1140,9 @@ def _raise(left, right, err_msg): ) diff = 0 - for left_arr, right_arr in zip(left, right): + for l, r in zip(left, right): # count up differences - if not array_equivalent(left_arr, right_arr, strict_nan=strict_nan): + if not array_equivalent(l, r, strict_nan=strict_nan): diff += 1 diff = diff * 100.0 / left.size @@ -1212,13 +1205,6 @@ def assert_extension_array_equal( Missing values are checked separately from valid values. A mask of missing values is computed for each and checked to match. The remaining all-valid values are cast to object dtype and checked. - - Examples - -------- - >>> from pandas.testing import assert_extension_array_equal - >>> a = pd.Series([1, 2, 3, 4]) - >>> b, c = a.array, a.array - >>> assert_extension_array_equal(b, c) """ if check_less_precise is not no_default: warnings.warn( @@ -1348,13 +1334,6 @@ def assert_series_equal( obj : str, default 'Series' Specify object name being compared, internally used to show appropriate assertion message. - - Examples - -------- - >>> from pandas.testing import assert_series_equal - >>> a = pd.Series([1, 2, 3, 4]) - >>> b = pd.Series([1, 2, 3, 4]) - >>> assert_series_equal(a, b) """ __tracebackhide__ = True @@ -1768,7 +1747,7 @@ def box_expected(expected, box_cls, transpose=True): elif box_cls is pd.DataFrame: expected = pd.Series(expected).to_frame() if transpose: - # for vector operations, we need a DataFrame to be a single-row, + # for vector operations, we we need a DataFrame to be a single-row, # not a single-column, in order to operate against non-DataFrame # vectors of the same length. expected = expected.T diff --git a/pandas/_typing.py b/pandas/_typing.py index 7f01bcaa1c50e..55a1c17b0aa53 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -146,5 +146,10 @@ CompressionOptions = Optional[Union[str, CompressionDict]] +# let's bind types +ModeVar = TypeVar("ModeVar", str, None, Optional[str]) +EncodingVar = TypeVar("EncodingVar", str, None, Optional[str]) + + # type of float formatter in DataFrameFormatter FloatFormatType = Union[str, Callable, "EngFormatter"] diff --git a/pandas/_version.py b/pandas/_version.py index 14c2b5c6e7603..d2df063ff3acf 100644 --- a/pandas/_version.py +++ b/pandas/_version.py @@ -5,36 +5,31 @@ # that just contains the computed version number. # This file is released into the public domain. 
Generated by -# versioneer-0.19 (https://github.com/python-versioneer/python-versioneer) - -"""Git implementation of _version.py.""" +# versioneer-0.15 (https://github.com/warner/python-versioneer) import errno import os import re import subprocess import sys +from typing import Callable, Dict def get_keywords(): - """Get the keywords needed to look up the version information.""" # these strings will be replaced by git during git-archive. # setup.py/versioneer.py will grep for the variable names, so they must # each be defined on a line of their own. _version.py will just call # get_keywords(). git_refnames = "$Format:%d$" git_full = "$Format:%H$" - git_date = "$Format:%ci$" - keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} - return keywords + return {"refnames": git_refnames, "full": git_full} class VersioneerConfig: - """Container for Versioneer configuration parameters.""" + pass def get_config(): - """Create, populate and return the VersioneerConfig() object.""" # these strings are filled in when 'setup.py versioneer' creates # _version.py cfg = VersioneerConfig() @@ -48,17 +43,14 @@ def get_config(): class NotThisMethod(Exception): - """Exception raised if a method is not valid for the current scenario.""" - + pass -HANDLERS = {} +HANDLERS: Dict[str, Dict[str, Callable]] = {} -def register_vcs_handler(vcs, method): # decorator - """Create decorator to mark a method as the handler of a VCS.""" - def decorate(f): - """Store f in HANDLERS[vcs][method].""" +def register_vcs_handler(vcs: str, method: str) -> Callable: # decorator + def decorate(f: Callable) -> Callable: if vcs not in HANDLERS: HANDLERS[vcs] = {} HANDLERS[vcs][method] = f @@ -67,8 +59,7 @@ def decorate(f): return decorate -def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None): - """Call the given command(s).""" +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False): assert isinstance(commands, list) p = None for c in commands: @@ -78,7 +69,6 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env= p = subprocess.Popen( [c] + args, cwd=cwd, - env=env, stdout=subprocess.PIPE, stderr=(subprocess.PIPE if hide_stderr else None), ) @@ -88,77 +78,58 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env= if e.errno == errno.ENOENT: continue if verbose: - print("unable to run %s" % dispcmd) + print(f"unable to run {dispcmd}") print(e) - return None, None + return None else: if verbose: print(f"unable to find command, tried {commands}") - return None, None + return None stdout = p.communicate()[0].strip().decode() if p.returncode != 0: if verbose: - print("unable to run %s (error)" % dispcmd) - print("stdout was %s" % stdout) - return None, p.returncode - return stdout, p.returncode + print(f"unable to run {dispcmd} (error)") + return None + return stdout def versions_from_parentdir(parentdir_prefix, root, verbose): - """Try to determine the version from the parent directory name. - - Source tarballs conventionally unpack into a directory that includes both - the project name and a version string. 
We will also support searching up - two directory levels for an appropriately named parent directory - """ - rootdirs = [] - - for i in range(3): - dirname = os.path.basename(root) - if dirname.startswith(parentdir_prefix): - return { - "version": dirname[len(parentdir_prefix) :], - "full-revisionid": None, - "dirty": False, - "error": None, - "date": None, - } - else: - rootdirs.append(root) - root = os.path.dirname(root) # up a level - - if verbose: - print( - "Tried directories %s but none started with prefix %s" - % (str(rootdirs), parentdir_prefix) - ) - raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + # Source tarballs conventionally unpack into a directory that includes + # both the project name and a version string. + dirname = os.path.basename(root) + if not dirname.startswith(parentdir_prefix): + if verbose: + print( + f"guessing rootdir is '{root}', but '{dirname}' " + f"doesn't start with prefix '{parentdir_prefix}'" + ) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + return { + "version": dirname[len(parentdir_prefix) :], + "full-revisionid": None, + "dirty": False, + "error": None, + } @register_vcs_handler("git", "get_keywords") def git_get_keywords(versionfile_abs): - """Extract version information from the given file.""" # the code embedded in _version.py can just fetch the value of these # keywords. When used from setup.py, we don't want to import _version.py, # so we do it with a regexp instead. This function is not used from # _version.py. keywords = {} try: - f = open(versionfile_abs) - for line in f.readlines(): - if line.strip().startswith("git_refnames ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["refnames"] = mo.group(1) - if line.strip().startswith("git_full ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["full"] = mo.group(1) - if line.strip().startswith("git_date ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["date"] = mo.group(1) - f.close() + with open(versionfile_abs) as fd: + for line in fd.readlines(): + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) except OSError: pass return keywords @@ -166,22 +137,8 @@ def git_get_keywords(versionfile_abs): @register_vcs_handler("git", "keywords") def git_versions_from_keywords(keywords, tag_prefix, verbose): - """Get version information from git keywords.""" if not keywords: raise NotThisMethod("no keywords at all, weird") - date = keywords.get("date") - if date is not None: - # Use only the last line. Previous lines may contain GPG signature - # information. - date = date.splitlines()[-1] - - # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant - # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 - # -like" string, which we must then edit to make compliant), because - # it's been around since git-1.5.3, and it's too difficult to - # discover which version we're using, or to work around using an - # older one. - date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) refnames = keywords["refnames"].strip() if refnames.startswith("$Format"): if verbose: @@ -202,21 +159,20 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # "stabilization", as well as "HEAD" and "master". 
tags = {r for r in refs if re.search(r"\d", r)} if verbose: - print("discarding '%s', no digits" % ",".join(refs - tags)) + print(f"discarding '{','.join(refs - tags)}', no digits") if verbose: - print("likely tags: %s" % ",".join(sorted(tags))) + print(f"likely tags: {','.join(sorted(tags))}") for ref in sorted(tags): # sorting will prefer e.g. "2.0" over "2.0rc1" if ref.startswith(tag_prefix): r = ref[len(tag_prefix) :] if verbose: - print("picking %s" % r) + print(f"picking {r}") return { "version": r, "full-revisionid": keywords["full"].strip(), "dirty": False, "error": None, - "date": date, } # no suitable tags, so version is "0+unknown", but full hex is still there if verbose: @@ -226,48 +182,34 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): "full-revisionid": keywords["full"].strip(), "dirty": False, "error": "no suitable tags", - "date": None, } @register_vcs_handler("git", "pieces_from_vcs") def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): - """Get version from 'git describe' in the root of the source tree. + # this runs 'git' from the root of the source tree. This only gets called + # if the git-archive 'subst' keywords were *not* expanded, and + # _version.py hasn't already been rewritten with a short version string, + # meaning we're inside a checked out source tree. + + if not os.path.exists(os.path.join(root, ".git")): + if verbose: + print(f"no .git in {root}") + raise NotThisMethod("no .git directory") - This only gets called if the git-archive 'subst' keywords were *not* - expanded, and _version.py hasn't already been rewritten with a short - version string, meaning we're inside a checked out source tree. - """ GITS = ["git"] if sys.platform == "win32": GITS = ["git.cmd", "git.exe"] - - out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True) - if rc != 0: - if verbose: - print("Directory %s not under git control" % root) - raise NotThisMethod("'git rev-parse --git-dir' returned error") - - # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] - # if there isn't one, this yields HEX[-dirty] (no NUM) - describe_out, rc = run_command( - GITS, - [ - "describe", - "--tags", - "--dirty", - "--always", - "--long", - "--match", - "%s*" % tag_prefix, - ], - cwd=root, + # if there is a tag, this yields TAG-NUM-gHEX[-dirty] + # if there are no tags, this yields HEX[-dirty] (no NUM) + describe_out = run_command( + GITS, ["describe", "--tags", "--dirty", "--always", "--long"], cwd=root ) # --long was added in git-1.5.5 if describe_out is None: raise NotThisMethod("'git describe' failed") describe_out = describe_out.strip() - full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) + full_out = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) if full_out is None: raise NotThisMethod("'git rev-parse' failed") full_out = full_out.strip() @@ -294,20 +236,18 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) if not mo: # unparseable. Maybe git-describe is misbehaving? 
- pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out + pieces["error"] = f"unable to parse git-describe output: '{describe_out}'" return pieces # tag full_tag = mo.group(1) if not full_tag.startswith(tag_prefix): + msg = f"tag '{full_tag}' doesn't start with prefix '{tag_prefix}'" if verbose: - fmt = "tag '%s' doesn't start with prefix '%s'" - print(fmt % (full_tag, tag_prefix)) - pieces["error"] = "tag '{}' doesn't start with prefix '{}'".format( - full_tag, - tag_prefix, - ) + print(msg) + pieces["error"] = msg return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix) :] # distance: number of commits since tag @@ -319,129 +259,110 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): else: # HEX: no tags pieces["closest-tag"] = None - count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root) + count_out = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root) pieces["distance"] = int(count_out) # total number of commits - # commit date: see ISO-8601 comment in git_versions_from_keywords() - date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[ - 0 - ].strip() - # Use only the last line. Previous lines may contain GPG signature - # information. - date = date.splitlines()[-1] - pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) - return pieces def plus_or_dot(pieces): - """Return a + if we don't already have one, else return a .""" if "+" in pieces.get("closest-tag", ""): return "." return "+" def render_pep440(pieces): - """Build up version string, with post-release "local version identifier". + # now build up version string, with post-release "local version + # identifier". Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + # get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty - Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you - get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + # exceptions: + # 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] - Exceptions: - 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] - """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += plus_or_dot(pieces) - rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) - if pieces["dirty"]: - rendered += ".dirty" + rendered += f"{pieces['distance']:d}.g{pieces['short']}" else: # exception #1 - rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) - if pieces["dirty"]: - rendered += ".dirty" + rendered = f"0+untagged.{pieces['distance']:d}.g{pieces['short']}" + if pieces["dirty"]: + rendered += ".dirty" return rendered def render_pep440_pre(pieces): - """TAG[.post0.devDISTANCE] -- No -dirty. + # TAG[.post.devDISTANCE] . No -dirty + + # exceptions: + # 1: no tags. 0.post.devDISTANCE - Exceptions: - 1: no tags. 0.post0.devDISTANCE - """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"]: - rendered += ".post0.dev%d" % pieces["distance"] + rendered += f".post.dev{pieces['distance']:d}" else: # exception #1 - rendered = "0.post0.dev%d" % pieces["distance"] + rendered = f"0.post.dev{pieces['distance']:d}" return rendered def render_pep440_post(pieces): - """TAG[.postDISTANCE[.dev0]+gHEX] . + # TAG[.postDISTANCE[.dev0]+gHEX] . The ".dev0" means dirty. 
Note that + # .dev0 sorts backwards (a dirty tree will appear "older" than the + # corresponding clean one), but you shouldn't be releasing software with + # -dirty anyways. - The ".dev0" means dirty. Note that .dev0 sorts backwards - (a dirty tree will appear "older" than the corresponding clean one), - but you shouldn't be releasing software with -dirty anyways. + # exceptions: + # 1: no tags. 0.postDISTANCE[.dev0] - Exceptions: - 1: no tags. 0.postDISTANCE[.dev0] - """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: - rendered += ".post%d" % pieces["distance"] + rendered += f".post{pieces['distance']:d}" if pieces["dirty"]: rendered += ".dev0" rendered += plus_or_dot(pieces) - rendered += "g%s" % pieces["short"] + rendered += f"g{pieces['short']}" else: # exception #1 - rendered = "0.post%d" % pieces["distance"] + rendered = f"0.pos{pieces['distance']:d}" if pieces["dirty"]: rendered += ".dev0" - rendered += "+g%s" % pieces["short"] + rendered += f"+g{pieces['short']}" return rendered def render_pep440_old(pieces): - """TAG[.postDISTANCE[.dev0]] . + # TAG[.postDISTANCE[.dev0]] . The ".dev0" means dirty. - The ".dev0" means dirty. + # exceptions: + # 1: no tags. 0.postDISTANCE[.dev0] - Exceptions: - 1: no tags. 0.postDISTANCE[.dev0] - """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: - rendered += ".post%d" % pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" + rendered += f".post{pieces['distance']:d}" else: # exception #1 - rendered = "0.post%d" % pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" + rendered = f"0.post{pieces['distance']:d}" + if pieces["dirty"]: + rendered += ".dev0" return rendered def render_git_describe(pieces): - """TAG[-DISTANCE-gHEX][-dirty]. + # TAG[-DISTANCE-gHEX][-dirty], like 'git describe --tags --dirty + # --always' - Like 'git describe --tags --dirty --always'. + # exceptions: + # 1: no tags. HEX[-dirty] (note: no 'g' prefix) - Exceptions: - 1: no tags. HEX[-dirty] (note: no 'g' prefix) - """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"]: - rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + rendered += f"-{pieces['distance']:d}-g{pieces['short']}" else: # exception #1 rendered = pieces["short"] @@ -451,17 +372,15 @@ def render_git_describe(pieces): def render_git_describe_long(pieces): - """TAG-DISTANCE-gHEX[-dirty]. + # TAG-DISTANCE-gHEX[-dirty], like 'git describe --tags --dirty + # --always -long'. The distance/hash is unconditional. - Like 'git describe --tags --dirty --always -long'. - The distance/hash is unconditional. + # exceptions: + # 1: no tags. HEX[-dirty] (note: no 'g' prefix) - Exceptions: - 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) - """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] - rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + rendered += f"-{pieces['distance']:d}-g{pieces['short']}" else: # exception #1 rendered = pieces["short"] @@ -471,14 +390,12 @@ def render_git_describe_long(pieces): def render(pieces, style): - """Render the given version pieces into the requested style.""" if pieces["error"]: return { "version": "unknown", "full-revisionid": pieces.get("long"), "dirty": None, "error": pieces["error"], - "date": None, } if not style or style == "default": @@ -497,19 +414,17 @@ def render(pieces, style): elif style == "git-describe-long": rendered = render_git_describe_long(pieces) else: - raise ValueError("unknown style '%s'" % style) + raise ValueError(f"unknown style '{style}'") return { "version": rendered, "full-revisionid": pieces["long"], "dirty": pieces["dirty"], "error": None, - "date": pieces.get("date"), } def get_versions(): - """Get version information or return default if unable to do so.""" # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have # __file__, we can work backwards from there to the root. Some # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which @@ -536,7 +451,6 @@ def get_versions(): "full-revisionid": None, "dirty": None, "error": "unable to find root of source tree", - "date": None, } try: @@ -556,5 +470,4 @@ def get_versions(): "full-revisionid": None, "dirty": None, "error": "unable to compute version", - "date": None, } diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 533e67acfa2f4..d3c7888cac704 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -25,7 +25,7 @@ "sqlalchemy": "1.2.8", "tables": "3.5.1", "tabulate": "0.8.3", - "xarray": "0.12.3", + "xarray": "0.12.0", "xlrd": "1.2.0", "xlwt": "1.3.0", "xlsxwriter": "1.0.2", diff --git a/pandas/conftest.py b/pandas/conftest.py index a0ec6f96042fc..b2daa2c5bc3f7 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -33,10 +33,8 @@ import pandas.util._test_decorators as td -from pandas.core.dtypes.dtypes import DatetimeTZDtype, IntervalDtype - import pandas as pd -from pandas import DataFrame, Interval, Period, Series, Timedelta, Timestamp +from pandas import DataFrame, Series import pandas._testing as tm from pandas.core import ops from pandas.core.indexes.api import Index, MultiIndex @@ -472,8 +470,8 @@ def index_with_missing(request): if request.param in ["tuples", "mi-with-dt64tz-level", "multi"]: # For setting missing values in the top level of MultiIndex vals = ind.tolist() - vals[0] = (None,) + vals[0][1:] - vals[-1] = (None,) + vals[-1][1:] + vals[0] = tuple([None]) + vals[0][1:] + vals[-1] = tuple([None]) + vals[-1][1:] return MultiIndex.from_tuples(vals) else: vals[0] = None @@ -689,26 +687,6 @@ def float_frame(): return DataFrame(tm.getSeriesData()) -# ---------------------------------------------------------------- -# Scalars -# ---------------------------------------------------------------- -@pytest.fixture( - params=[ - (Interval(left=0, right=5), IntervalDtype("int64")), - (Interval(left=0.1, right=0.5), IntervalDtype("float64")), - (Period("2012-01", freq="M"), "period[M]"), - (Period("2012-02-01", freq="D"), "period[D]"), - ( - Timestamp("2011-01-01", tz="US/Eastern"), - DatetimeTZDtype(tz="US/Eastern"), - ), - (Timedelta(seconds=500), "timedelta64[ns]"), - ] -) -def ea_scalar_and_dtype(request): - return request.param - - # 
---------------------------------------------------------------- # Operators & Operations # ---------------------------------------------------------------- @@ -1165,26 +1143,6 @@ def any_nullable_int_dtype(request): return request.param -@pytest.fixture(params=tm.ALL_EA_INT_DTYPES + tm.FLOAT_EA_DTYPES) -def any_numeric_dtype(request): - """ - Parameterized fixture for any nullable integer dtype and - any float ea dtypes. - - * 'UInt8' - * 'Int8' - * 'UInt16' - * 'Int16' - * 'UInt32' - * 'Int32' - * 'UInt64' - * 'Int64' - * 'Float32' - * 'Float64' - """ - return request.param - - @pytest.fixture(params=tm.SIGNED_EA_INT_DTYPES) def any_signed_nullable_int_dtype(request): """ diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 7bae912a070a9..ec88eb817b3f8 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -49,9 +49,9 @@ ) from pandas.core.dtypes.generic import ( ABCExtensionArray, + ABCIndex, ABCIndexClass, ABCMultiIndex, - ABCRangeIndex, ABCSeries, ) from pandas.core.dtypes.missing import isna, na_value_for_dtype @@ -60,7 +60,7 @@ from pandas.core.indexers import validate_indices if TYPE_CHECKING: - from pandas import Categorical, DataFrame, Index, Series + from pandas import Categorical, DataFrame, Series _shared_docs: Dict[str, str] = {} @@ -69,7 +69,7 @@ # dtype access # # --------------- # def _ensure_data( - values: ArrayLike, dtype: Optional[DtypeObj] = None + values, dtype: Optional[DtypeObj] = None ) -> Tuple[np.ndarray, DtypeObj]: """ routine to ensure that our data is of the correct @@ -95,12 +95,6 @@ def _ensure_data( pandas_dtype : np.dtype or ExtensionDtype """ - if dtype is not None: - # We only have non-None dtype when called from `isin`, and - # both Datetimelike and Categorical dispatch before getting here. 
- assert not needs_i8_conversion(dtype) - assert not is_categorical_dtype(dtype) - if not isinstance(values, ABCMultiIndex): # extract_array would raise values = extract_array(values, extract_numpy=True) @@ -137,20 +131,21 @@ def _ensure_data( return ensure_object(values), np.dtype("object") # datetimelike - if needs_i8_conversion(values.dtype) or needs_i8_conversion(dtype): - if is_period_dtype(values.dtype) or is_period_dtype(dtype): + vals_dtype = getattr(values, "dtype", None) + if needs_i8_conversion(vals_dtype) or needs_i8_conversion(dtype): + if is_period_dtype(vals_dtype) or is_period_dtype(dtype): from pandas import PeriodIndex - values = PeriodIndex(values)._data + values = PeriodIndex(values) dtype = values.dtype - elif is_timedelta64_dtype(values.dtype) or is_timedelta64_dtype(dtype): + elif is_timedelta64_dtype(vals_dtype) or is_timedelta64_dtype(dtype): from pandas import TimedeltaIndex - values = TimedeltaIndex(values)._data + values = TimedeltaIndex(values) dtype = values.dtype else: # Datetime - if values.ndim > 1 and is_datetime64_ns_dtype(values.dtype): + if values.ndim > 1 and is_datetime64_ns_dtype(vals_dtype): # Avoid calling the DatetimeIndex constructor as it is 1D only # Note: this is reached by DataFrame.rank calls GH#27027 # TODO(EA2D): special case not needed with 2D EAs @@ -160,12 +155,12 @@ def _ensure_data( from pandas import DatetimeIndex - values = DatetimeIndex(values)._data + values = DatetimeIndex(values) dtype = values.dtype return values.asi8, dtype - elif is_categorical_dtype(values.dtype) and ( + elif is_categorical_dtype(vals_dtype) and ( is_categorical_dtype(dtype) or dtype is None ): values = values.codes @@ -223,8 +218,7 @@ def _ensure_arraylike(values): """ if not is_array_like(values): inferred = lib.infer_dtype(values, skipna=False) - if inferred in ["mixed", "string", "mixed-integer"]: - # "mixed-integer" to ensure we do not cast ["ss", 42] to str GH#22160 + if inferred in ["mixed", "string"]: if isinstance(values, tuple): values = list(values) values = construct_1d_object_array_from_listlike(values) @@ -242,11 +236,11 @@ def _ensure_arraylike(values): } -def _get_hashtable_algo(values: np.ndarray): +def _get_hashtable_algo(values): """ Parameters ---------- - values : np.ndarray + values : arraylike Returns ------- @@ -260,15 +254,15 @@ def _get_hashtable_algo(values: np.ndarray): return htable, values -def _get_values_for_rank(values: ArrayLike): +def _get_values_for_rank(values): if is_categorical_dtype(values): - values = cast("Categorical", values)._values_for_rank() + values = values._values_for_rank() values, _ = _ensure_data(values) return values -def get_data_algo(values: ArrayLike): +def get_data_algo(values): values = _get_values_for_rank(values) ndtype = _check_object_for_strings(values) @@ -426,46 +420,32 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: f"to isin(), you passed a [{type(values).__name__}]" ) - if not isinstance( - values, (ABCIndexClass, ABCSeries, ABCExtensionArray, np.ndarray) - ): + if not isinstance(values, (ABCIndex, ABCSeries, ABCExtensionArray, np.ndarray)): values = construct_1d_object_array_from_listlike(list(values)) # TODO: could use ensure_arraylike here - elif isinstance(values, ABCMultiIndex): - # Avoid raising in extract_array - values = np.array(values) - comps = _ensure_arraylike(comps) comps = extract_array(comps, extract_numpy=True) - if is_categorical_dtype(comps.dtype): + if is_categorical_dtype(comps): # TODO(extension) # handle categoricals return cast("Categorical", 
comps).isin(values) - if needs_i8_conversion(comps.dtype): - # Dispatch to DatetimeLikeArrayMixin.isin - return array(comps).isin(values) - elif needs_i8_conversion(values.dtype) and not is_object_dtype(comps.dtype): - # e.g. comps are integers and values are datetime64s - return np.zeros(comps.shape, dtype=bool) - comps, dtype = _ensure_data(comps) values, _ = _ensure_data(values, dtype=dtype) + # faster for larger cases to use np.in1d f = htable.ismember_object # GH16012 # Ensure np.in1d doesn't get object types or it *may* throw an exception - # Albeit hashmap has O(1) look-up (vs. O(logn) in sorted array), - # in1d is faster for small sizes - if len(comps) > 1_000_000 and len(values) <= 26 and not is_object_dtype(comps): - # If the values include nan we need to check for nan explicitly + if len(comps) > 1_000_000 and not is_object_dtype(comps): + # If the the values include nan we need to check for nan explicitly # since np.nan it not equal to np.nan if isna(values).any(): f = lambda c, v: np.logical_or(np.in1d(c, v), np.isnan(c)) else: f = np.in1d - elif is_integer_dtype(comps.dtype): + elif is_integer_dtype(comps): try: values = values.astype("int64", copy=False) comps = comps.astype("int64", copy=False) @@ -474,7 +454,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: values = values.astype(object) comps = comps.astype(object) - elif is_float_dtype(comps.dtype): + elif is_float_dtype(comps): try: values = values.astype("float64", copy=False) comps = comps.astype("float64", copy=False) @@ -487,7 +467,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: def factorize_array( - values: np.ndarray, na_sentinel: int = -1, size_hint=None, na_value=None, mask=None + values, na_sentinel: int = -1, size_hint=None, na_value=None, mask=None ) -> Tuple[np.ndarray, np.ndarray]: """ Factorize an array-like to codes and uniques. @@ -553,7 +533,7 @@ def factorize( sort: bool = False, na_sentinel: Optional[int] = -1, size_hint: Optional[int] = None, -) -> Tuple[np.ndarray, Union[np.ndarray, "Index"]]: +) -> Tuple[np.ndarray, Union[np.ndarray, ABCIndex]]: """ Encode the object as an enumerated type or categorical variable. @@ -683,9 +663,7 @@ def factorize( na_sentinel = -1 dropna = False - if isinstance(values, ABCRangeIndex): - return values.factorize(sort=sort) - elif is_extension_array_dtype(values.dtype): + if is_extension_array_dtype(values.dtype): values = extract_array(values) codes, uniques = values.factorize(na_sentinel=na_sentinel) dtype = original.dtype @@ -853,7 +831,7 @@ def value_counts_arraylike(values, dropna: bool): return keys, counts -def duplicated(values: ArrayLike, keep: str = "first") -> np.ndarray: +def duplicated(values, keep="first") -> np.ndarray: """ Return boolean ndarray denoting duplicate values. @@ -1566,7 +1544,7 @@ def take(arr, indices, axis: int = 0, allow_fill: bool = False, fill_value=None) * True: negative values in `indices` indicate missing values. These values are set to `fill_value`. Any other - negative values raise a ``ValueError``. + other negative values raise a ``ValueError``. fill_value : any, optional Fill value to use for NA-indices when `allow_fill` is True. @@ -1804,7 +1782,7 @@ def func(arr, indexer, out, fill_value=np.nan): # ------------ # -def searchsorted(arr, value, side="left", sorter=None) -> np.ndarray: +def searchsorted(arr, value, side="left", sorter=None): """ Find indices where elements should be inserted to maintain order. 
@@ -1853,7 +1831,7 @@ def searchsorted(arr, value, side="left", sorter=None) -> np.ndarray: if ( isinstance(arr, np.ndarray) - and is_integer_dtype(arr.dtype) + and is_integer_dtype(arr) and (is_integer(value) or is_integer_dtype(value)) ): # if `arr` and `value` have different dtypes, `arr` would be @@ -2171,24 +2149,3 @@ def _sort_tuples(values: np.ndarray[tuple]): arrays, _ = to_arrays(values, None) indexer = lexsort_indexer(arrays, orders=True) return values[indexer] - - -def make_duplicates_of_left_unique_in_right( - left: np.ndarray, right: np.ndarray -) -> np.ndarray: - """ - If left has duplicates, which are also duplicated in right, this duplicated values - are dropped from right, meaning that every duplicate value from left exists only - once in right. - - Parameters - ---------- - left: ndarray - right: ndarray - - Returns - ------- - Duplicates of left are unique in right - """ - left_duplicates = unique(left[duplicated(left)]) - return right[~(duplicated(right) & isin(right, left_duplicates))] diff --git a/pandas/core/apply.py b/pandas/core/apply.py index c5260deafc0c3..fa4fbe711fbe4 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -26,6 +26,7 @@ def frame_apply( axis: Axis = 0, raw: bool = False, result_type: Optional[str] = None, + ignore_failures: bool = False, args=None, kwds=None, ): @@ -42,6 +43,7 @@ def frame_apply( func, raw=raw, result_type=result_type, + ignore_failures=ignore_failures, args=args, kwds=kwds, ) @@ -82,11 +84,13 @@ def __init__( func, raw: bool, result_type: Optional[str], + ignore_failures: bool, args, kwds, ): self.obj = obj self.raw = raw + self.ignore_failures = ignore_failures self.args = args or () self.kwds = kwds or {} @@ -279,14 +283,29 @@ def apply_series_generator(self) -> Tuple[ResType, "Index"]: results = {} - with option_context("mode.chained_assignment", None): + if self.ignore_failures: + successes = [] for i, v in enumerate(series_gen): - # ignore SettingWithCopy here in case the user mutates - results[i] = self.f(v) - if isinstance(results[i], ABCSeries): - # If we have a view on v, we need to make a copy because - # series_generator will swap out the underlying data - results[i] = results[i].copy(deep=False) + try: + results[i] = self.f(v) + except Exception: + pass + else: + successes.append(i) + + # so will work with MultiIndex + if len(successes) < len(res_index): + res_index = res_index.take(successes) + + else: + with option_context("mode.chained_assignment", None): + for i, v in enumerate(series_gen): + # ignore SettingWithCopy here in case the user mutates + results[i] = self.f(v) + if isinstance(results[i], ABCSeries): + # If we have a view on v, we need to make a copy because + # series_generator will swap out the underlying data + results[i] = results[i].copy(deep=False) return results, res_index diff --git a/pandas/core/arraylike.py b/pandas/core/arraylike.py index 6b28f8f135769..da366c9abf0a4 100644 --- a/pandas/core/arraylike.py +++ b/pandas/core/arraylike.py @@ -5,15 +5,8 @@ ExtensionArray """ import operator -from typing import Any, Callable -import warnings -import numpy as np - -from pandas._libs import lib - -from pandas.core.construction import extract_array -from pandas.core.ops import maybe_dispatch_ufunc_to_dunder_op, roperator +from pandas.core.ops import roperator from pandas.core.ops.common import unpack_zerodim_and_defer @@ -147,138 +140,3 @@ def __pow__(self, other): @unpack_zerodim_and_defer("__rpow__") def __rpow__(self, other): return self._arith_method(other, roperator.rpow) - - -def 
array_ufunc(self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any): - """ - Compatibility with numpy ufuncs. - - See also - -------- - numpy.org/doc/stable/reference/arrays.classes.html#numpy.class.__array_ufunc__ - """ - from pandas.core.generic import NDFrame - from pandas.core.internals import BlockManager - - cls = type(self) - - # for binary ops, use our custom dunder methods - result = maybe_dispatch_ufunc_to_dunder_op(self, ufunc, method, *inputs, **kwargs) - if result is not NotImplemented: - return result - - # Determine if we should defer. - no_defer = (np.ndarray.__array_ufunc__, cls.__array_ufunc__) - - for item in inputs: - higher_priority = ( - hasattr(item, "__array_priority__") - and item.__array_priority__ > self.__array_priority__ - ) - has_array_ufunc = ( - hasattr(item, "__array_ufunc__") - and type(item).__array_ufunc__ not in no_defer - and not isinstance(item, self._HANDLED_TYPES) - ) - if higher_priority or has_array_ufunc: - return NotImplemented - - # align all the inputs. - types = tuple(type(x) for x in inputs) - alignable = [x for x, t in zip(inputs, types) if issubclass(t, NDFrame)] - - if len(alignable) > 1: - # This triggers alignment. - # At the moment, there aren't any ufuncs with more than two inputs - # so this ends up just being x1.index | x2.index, but we write - # it to handle *args. - - if len(set(types)) > 1: - # We currently don't handle ufunc(DataFrame, Series) - # well. Previously this raised an internal ValueError. We might - # support it someday, so raise a NotImplementedError. - raise NotImplementedError( - "Cannot apply ufunc {} to mixed DataFrame and Series " - "inputs.".format(ufunc) - ) - axes = self.axes - for obj in alignable[1:]: - # this relies on the fact that we aren't handling mixed - # series / frame ufuncs. - for i, (ax1, ax2) in enumerate(zip(axes, obj.axes)): - axes[i] = ax1.union(ax2) - - reconstruct_axes = dict(zip(self._AXIS_ORDERS, axes)) - inputs = tuple( - x.reindex(**reconstruct_axes) if issubclass(t, NDFrame) else x - for x, t in zip(inputs, types) - ) - else: - reconstruct_axes = dict(zip(self._AXIS_ORDERS, self.axes)) - - if self.ndim == 1: - names = [getattr(x, "name") for x in inputs if hasattr(x, "name")] - name = names[0] if len(set(names)) == 1 else None - reconstruct_kwargs = {"name": name} - else: - reconstruct_kwargs = {} - - def reconstruct(result): - if lib.is_scalar(result): - return result - if result.ndim != self.ndim: - if method == "outer": - if self.ndim == 2: - # we already deprecated for Series - msg = ( - "outer method for ufunc {} is not implemented on " - "pandas objects. Returning an ndarray, but in the " - "future this will raise a 'NotImplementedError'. " - "Consider explicitly converting the DataFrame " - "to an array with '.to_numpy()' first." - ) - warnings.warn(msg.format(ufunc), FutureWarning, stacklevel=4) - return result - raise NotImplementedError - return result - if isinstance(result, BlockManager): - # we went through BlockManager.apply - result = self._constructor(result, **reconstruct_kwargs, copy=False) - else: - # we converted an array, lost our axes - result = self._constructor( - result, **reconstruct_axes, **reconstruct_kwargs, copy=False - ) - # TODO: When we support multiple values in __finalize__, this - # should pass alignable to `__fianlize__` instead of self. - # Then `np.add(a, b)` would consider attrs from both a and b - # when a and b are NDFrames. 
- if len(alignable) == 1: - result = result.__finalize__(self) - return result - - if self.ndim > 1 and ( - len(inputs) > 1 or ufunc.nout > 1 # type: ignore[attr-defined] - ): - # Just give up on preserving types in the complex case. - # In theory we could preserve them for them. - # * nout>1 is doable if BlockManager.apply took nout and - # returned a Tuple[BlockManager]. - # * len(inputs) > 1 is doable when we know that we have - # aligned blocks / dtypes. - inputs = tuple(np.asarray(x) for x in inputs) - result = getattr(ufunc, method)(*inputs) - elif self.ndim == 1: - # ufunc(series, ...) - inputs = tuple(extract_array(x, extract_numpy=True) for x in inputs) - result = getattr(ufunc, method)(*inputs, **kwargs) - else: - # ufunc(dataframe) - mgr = inputs[0]._mgr - result = mgr.apply(getattr(ufunc, method)) - - if ufunc.nout > 1: # type: ignore[attr-defined] - result = tuple(reconstruct(x) for x in result) - else: - result = reconstruct(result) - return result diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 5cc6525dc3c9b..d84e2e2ad295b 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -1,6 +1,4 @@ -from __future__ import annotations - -from typing import Any, Optional, Sequence, Type, TypeVar, Union +from typing import Any, Optional, Sequence, TypeVar import numpy as np @@ -22,9 +20,7 @@ from pandas.core.construction import extract_array from pandas.core.indexers import check_array_indexer -NDArrayBackedExtensionArrayT = TypeVar( - "NDArrayBackedExtensionArrayT", bound="NDArrayBackedExtensionArray" -) +_T = TypeVar("_T", bound="NDArrayBackedExtensionArray") class NDArrayBackedExtensionArray(ExtensionArray): @@ -34,9 +30,7 @@ class NDArrayBackedExtensionArray(ExtensionArray): _ndarray: np.ndarray - def _from_backing_data( - self: NDArrayBackedExtensionArrayT, arr: np.ndarray - ) -> NDArrayBackedExtensionArrayT: + def _from_backing_data(self: _T, arr: np.ndarray) -> _T: """ Construct a new ExtensionArray `new_array` with `arr` as its _ndarray. @@ -58,13 +52,13 @@ def _validate_scalar(self, value): # ------------------------------------------------------------------------ def take( - self: NDArrayBackedExtensionArrayT, + self: _T, indices: Sequence[int], *, allow_fill: bool = False, fill_value: Any = None, axis: int = 0, - ) -> NDArrayBackedExtensionArrayT: + ) -> _T: if allow_fill: fill_value = self._validate_fill_value(fill_value) @@ -80,7 +74,7 @@ def take( def _validate_fill_value(self, fill_value): """ If a fill_value is passed to `take` convert it to a representation - suitable for self._ndarray, raising TypeError if this is not possible. + suitable for self._ndarray, raising ValueError if this is not possible. 
Parameters ---------- @@ -92,7 +86,7 @@ def _validate_fill_value(self, fill_value): Raises ------ - TypeError + ValueError """ raise AbstractMethodError(self) @@ -119,20 +113,16 @@ def size(self) -> int: def nbytes(self) -> int: return self._ndarray.nbytes - def reshape( - self: NDArrayBackedExtensionArrayT, *args, **kwargs - ) -> NDArrayBackedExtensionArrayT: + def reshape(self: _T, *args, **kwargs) -> _T: new_data = self._ndarray.reshape(*args, **kwargs) return self._from_backing_data(new_data) - def ravel( - self: NDArrayBackedExtensionArrayT, *args, **kwargs - ) -> NDArrayBackedExtensionArrayT: + def ravel(self: _T, *args, **kwargs) -> _T: new_data = self._ndarray.ravel(*args, **kwargs) return self._from_backing_data(new_data) @property - def T(self: NDArrayBackedExtensionArrayT) -> NDArrayBackedExtensionArrayT: + def T(self: _T) -> _T: new_data = self._ndarray.T return self._from_backing_data(new_data) @@ -148,13 +138,11 @@ def equals(self, other) -> bool: def _values_for_argsort(self): return self._ndarray - def copy(self: NDArrayBackedExtensionArrayT) -> NDArrayBackedExtensionArrayT: + def copy(self: _T) -> _T: new_data = self._ndarray.copy() return self._from_backing_data(new_data) - def repeat( - self: NDArrayBackedExtensionArrayT, repeats, axis=None - ) -> NDArrayBackedExtensionArrayT: + def repeat(self: _T, repeats, axis=None) -> _T: """ Repeat elements of an array. @@ -166,17 +154,13 @@ def repeat( new_data = self._ndarray.repeat(repeats, axis=axis) return self._from_backing_data(new_data) - def unique(self: NDArrayBackedExtensionArrayT) -> NDArrayBackedExtensionArrayT: + def unique(self: _T) -> _T: new_data = unique(self._ndarray) return self._from_backing_data(new_data) @classmethod @doc(ExtensionArray._concat_same_type) - def _concat_same_type( - cls: Type[NDArrayBackedExtensionArrayT], - to_concat: Sequence[NDArrayBackedExtensionArrayT], - axis: int = 0, - ) -> NDArrayBackedExtensionArrayT: + def _concat_same_type(cls, to_concat, axis: int = 0): dtypes = {str(x.dtype) for x in to_concat} if len(dtypes) != 1: raise ValueError("to_concat must have the same dtype (tz)", dtypes) @@ -214,9 +198,7 @@ def __setitem__(self, key, value): def _validate_setitem_value(self, value): return value - def __getitem__( - self: NDArrayBackedExtensionArrayT, key: Union[int, slice, np.ndarray] - ) -> Union[NDArrayBackedExtensionArrayT, Any]: + def __getitem__(self, key): if lib.is_integer(key): # fast-path result = self._ndarray[key] @@ -234,9 +216,7 @@ def __getitem__( return result @doc(ExtensionArray.fillna) - def fillna( - self: NDArrayBackedExtensionArrayT, value=None, method=None, limit=None - ) -> NDArrayBackedExtensionArrayT: + def fillna(self: _T, value=None, method=None, limit=None) -> _T: value, method = validate_fillna_kwargs(value, method) mask = self.isna() @@ -300,43 +280,3 @@ def __repr__(self) -> str: data = ",\n".join(lines) class_name = f"<{type(self).__name__}>" return f"{class_name}\n[\n{data}\n]\nShape: {self.shape}, dtype: {self.dtype}" - - # ------------------------------------------------------------------------ - # __array_function__ methods - - def putmask(self, mask, value): - """ - Analogue to np.putmask(self, mask, value) - - Parameters - ---------- - mask : np.ndarray[bool] - value : scalar or listlike - - Raises - ------ - TypeError - If value cannot be cast to self.dtype. 
- """ - value = self._validate_setitem_value(value) - - np.putmask(self._ndarray, mask, value) - - def where(self, mask, value): - """ - Analogue to np.where(mask, self, value) - - Parameters - ---------- - mask : np.ndarray[bool] - value : scalar or listlike - - Raises - ------ - TypeError - If value cannot be cast to self.dtype. - """ - value = self._validate_setitem_value(value) - - res_values = np.where(mask, self._ndarray, value) - return self._from_backing_data(res_values) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 448025e05422d..afbddc53804ac 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -6,21 +6,8 @@ This is an experimental API and subject to breaking changes without warning. """ -from __future__ import annotations - import operator -from typing import ( - Any, - Callable, - Dict, - Optional, - Sequence, - Tuple, - Type, - TypeVar, - Union, - cast, -) +from typing import Any, Callable, Dict, Optional, Sequence, Tuple, Union, cast import numpy as np @@ -50,8 +37,6 @@ _extension_array_shared_docs: Dict[str, str] = dict() -ExtensionArrayT = TypeVar("ExtensionArrayT", bound="ExtensionArray") - class ExtensionArray: """ @@ -256,9 +241,8 @@ def _from_factorized(cls, values, original): # Must be a Sequence # ------------------------------------------------------------------------ - def __getitem__( - self, item: Union[int, slice, np.ndarray] - ) -> Union[ExtensionArray, Any]: + def __getitem__(self, item): + # type (Any) -> Any """ Select a subset of self. @@ -471,7 +455,6 @@ def astype(self, dtype, copy=True): NumPy ndarray with 'dtype' for its dtype. """ from pandas.core.arrays.string_ import StringDtype - from pandas.core.arrays.string_arrow import ArrowStringDtype dtype = pandas_dtype(dtype) if is_dtype_equal(dtype, self.dtype): @@ -479,11 +462,7 @@ def astype(self, dtype, copy=True): return self else: return self.copy() - - # FIXME: Really hard-code here? - if isinstance( - dtype, (ArrowStringDtype, StringDtype) - ): # allow conversion to StringArrays + if isinstance(dtype, StringDtype): # allow conversion to StringArrays return dtype.construct_array_type()._from_sequence(self, copy=False) return np.array(self, dtype=dtype, copy=copy) @@ -669,7 +648,7 @@ def dropna(self): """ return self[~self.isna()] - def shift(self, periods: int = 1, fill_value: object = None) -> ExtensionArray: + def shift(self, periods: int = 1, fill_value: object = None) -> "ExtensionArray": """ Shift values by desired number. @@ -839,7 +818,7 @@ def _values_for_factorize(self) -> Tuple[np.ndarray, Any]: """ return self.astype(object), np.nan - def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ExtensionArray]: + def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, "ExtensionArray"]: """ Encode the extension array as an enumerated type. @@ -948,7 +927,7 @@ def take( *, allow_fill: bool = False, fill_value: Any = None, - ) -> ExtensionArray: + ) -> "ExtensionArray": """ Take elements from an array. @@ -1037,7 +1016,7 @@ def take(self, indices, allow_fill=False, fill_value=None): # pandas.api.extensions.take raise AbstractMethodError(self) - def copy(self: ExtensionArrayT) -> ExtensionArrayT: + def copy(self) -> "ExtensionArray": """ Return a copy of the array. 
@@ -1117,7 +1096,7 @@ def _formatter(self, boxed: bool = False) -> Callable[[Any], Optional[str]]: # Reshaping # ------------------------------------------------------------------------ - def transpose(self, *axes) -> ExtensionArray: + def transpose(self, *axes) -> "ExtensionArray": """ Return a transposed view on this array. @@ -1127,10 +1106,10 @@ def transpose(self, *axes) -> ExtensionArray: return self[:] @property - def T(self) -> ExtensionArray: + def T(self) -> "ExtensionArray": return self.transpose() - def ravel(self, order="C") -> ExtensionArray: + def ravel(self, order="C") -> "ExtensionArray": """ Return a flattened view on this array. @@ -1151,8 +1130,8 @@ def ravel(self, order="C") -> ExtensionArray: @classmethod def _concat_same_type( - cls: Type[ExtensionArrayT], to_concat: Sequence[ExtensionArrayT] - ) -> ExtensionArrayT: + cls, to_concat: Sequence["ExtensionArray"] + ) -> "ExtensionArray": """ Concatenate multiple array of this dtype. diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index fe66aae23f510..9f011bc9d2651 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2,7 +2,7 @@ from functools import partial import operator from shutil import get_terminal_size -from typing import Dict, Hashable, List, Sequence, Type, TypeVar, Union, cast +from typing import Dict, Hashable, List, Type, Union, cast from warnings import warn import numpy as np @@ -10,7 +10,6 @@ from pandas._config import get_option from pandas._libs import NaT, algos as libalgos, hashtable as htable -from pandas._libs.lib import no_default from pandas._typing import ArrayLike, Dtype, Ordered, Scalar from pandas.compat.numpy import function as nv from pandas.util._decorators import cache_readonly, deprecate_kwarg @@ -57,8 +56,6 @@ from pandas.io.formats import console -CategoricalT = TypeVar("CategoricalT", bound="Categorical") - def _cat_compare_op(op): opname = f"__{op.__name__}__" @@ -77,7 +74,7 @@ def func(self, other): "Unordered Categoricals can only compare equality or not" ) if isinstance(other, Categorical): - # Two Categoricals can only be compared if the categories are + # Two Categoricals can only be be compared if the categories are # the same (maybe up to ordering, depending on ordered) msg = "Categoricals can only be compared if 'categories' are the same." @@ -403,42 +400,20 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: If copy is set to False and dtype is categorical, the original object is returned. """ - if self.dtype is dtype: - result = self.copy() if copy else self - - elif is_categorical_dtype(dtype): + if is_categorical_dtype(dtype): dtype = cast(Union[str, CategoricalDtype], dtype) - # GH 10696/18593/18630 + # GH 10696/18593 dtype = self.dtype.update_dtype(dtype) self = self.copy() if copy else self - result = self._set_dtype(dtype) - - # TODO: consolidate with ndarray case? 
- elif is_extension_array_dtype(dtype): - result = array(self, dtype=dtype, copy=copy) - - elif is_integer_dtype(dtype) and self.isna().any(): + if dtype == self.dtype: + return self + return self._set_dtype(dtype) + if is_extension_array_dtype(dtype): + return array(self, dtype=dtype, copy=copy) + if is_integer_dtype(dtype) and self.isna().any(): raise ValueError("Cannot convert float NaN to integer") - - elif len(self.codes) == 0 or len(self.categories) == 0: - result = np.array(self, dtype=dtype, copy=copy) - - else: - # GH8628 (PERF): astype category codes instead of astyping array - try: - astyped_cats = self.categories.astype(dtype=dtype, copy=copy) - except ( - TypeError, # downstream error msg for CategoricalIndex is misleading - ValueError, - ): - msg = f"Cannot cast {self.categories.dtype} dtype to {dtype}" - raise ValueError(msg) - - astyped_cats = extract_array(astyped_cats, extract_numpy=True) - result = take_1d(astyped_cats, libalgos.ensure_platform_int(self._codes)) - - return result + return np.array(self, dtype=dtype, copy=copy) @cache_readonly def itemsize(self) -> int: @@ -1069,7 +1044,7 @@ def remove_categories(self, removals, inplace=False): new_categories, ordered=self.ordered, rename=False, inplace=inplace ) - def remove_unused_categories(self, inplace=no_default): + def remove_unused_categories(self, inplace=False): """ Remove categories which are not used. @@ -1079,8 +1054,6 @@ def remove_unused_categories(self, inplace=no_default): Whether or not to drop unused categories inplace or return a copy of this categorical with unused categories dropped. - .. deprecated:: 1.2.0 - Returns ------- cat : Categorical or None @@ -1094,17 +1067,6 @@ def remove_unused_categories(self, inplace=no_default): remove_categories : Remove the specified categories. set_categories : Set the categories to the specified ones. """ - if inplace is not no_default: - warn( - "The `inplace` parameter in pandas.Categorical." - "remove_unused_categories is deprecated and " - "will be removed in a future version.", - FutureWarning, - stacklevel=2, - ) - else: - inplace = False - inplace = validate_bool_kwarg(inplace, "inplace") cat = self if inplace else self.copy() idx, inv = np.unique(cat._codes, return_inverse=True) @@ -1228,7 +1190,7 @@ def _validate_searchsorted_value(self, value): def _validate_fill_value(self, fill_value): """ Convert a user-facing fill_value to a representation to use with our - underlying ndarray, raising TypeError if this is not possible. + underlying ndarray, raising ValueError if this is not possible. 
Parameters ---------- @@ -1240,7 +1202,7 @@ def _validate_fill_value(self, fill_value): Raises ------ - TypeError + ValueError """ if is_valid_nat_for_dtype(fill_value, self.categories.dtype): @@ -1248,7 +1210,7 @@ def _validate_fill_value(self, fill_value): elif fill_value in self.categories: fill_value = self._unbox_scalar(fill_value) else: - raise TypeError( + raise ValueError( f"'fill_value={fill_value}' is not present " "in this Categorical's categories" ) @@ -1697,6 +1659,7 @@ def fillna(self, value=None, method=None, limit=None): # We get ndarray or Categorical if called via Series.fillna, # where it will unwrap another aligned Series before getting here codes[mask] = new_codes[mask] + else: codes[mask] = new_codes @@ -1956,7 +1919,6 @@ def min(self, *, skipna=True, **kwargs): ------- min : the minimum of this `Categorical` """ - nv.validate_minmax_axis(kwargs.get("axis", 0)) nv.validate_min((), kwargs) self.check_for_ordered("min") @@ -1993,7 +1955,6 @@ def max(self, *, skipna=True, **kwargs): ------- max : the maximum of this `Categorical` """ - nv.validate_minmax_axis(kwargs.get("axis", 0)) nv.validate_max((), kwargs) self.check_for_ordered("max") @@ -2120,9 +2081,7 @@ def equals(self, other: object) -> bool: return False @classmethod - def _concat_same_type( - cls: Type[CategoricalT], to_concat: Sequence[CategoricalT], axis: int = 0 - ) -> CategoricalT: + def _concat_same_type(self, to_concat): from pandas.core.dtypes.concat import union_categoricals return union_categoricals(to_concat) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 8fa2c734092f4..f2f843886e802 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1,5 +1,3 @@ -from __future__ import annotations - from datetime import datetime, timedelta import operator from typing import ( @@ -62,7 +60,7 @@ from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna from pandas.core import nanops, ops -from pandas.core.algorithms import checked_add_with_arr, isin, unique1d, value_counts +from pandas.core.algorithms import checked_add_with_arr, unique1d, value_counts from pandas.core.arraylike import OpsMixin from pandas.core.arrays._mixins import NDArrayBackedExtensionArray import pandas.core.common as com @@ -101,8 +99,6 @@ class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray): _generate_range """ - # _infer_matches -> which infer_dtype strings are close enough to our own - _infer_matches: Tuple[str, ...] _is_recognized_dtype: Callable[[DtypeObj], bool] _recognized_scalars: Tuple[Type, ...] _data: np.ndarray @@ -147,6 +143,14 @@ def _scalar_from_string(self, value: str) -> DTScalarOrNaT: """ raise AbstractMethodError(self) + @classmethod + def _rebox_native(cls, value: int) -> Union[int, np.datetime64, np.timedelta64]: + """ + Box an integer unboxed via _unbox_scalar into the native type for + the underlying ndarray. 
+ """ + raise AbstractMethodError(cls) + def _unbox_scalar( self, value: DTScalarOrNaT, setitem: bool = False ) -> Union[np.int64, np.datetime64, np.timedelta64]: @@ -268,9 +272,7 @@ def __array__(self, dtype=None) -> np.ndarray: return np.array(list(self), dtype=object) return self._ndarray - def __getitem__( - self, key: Union[int, slice, np.ndarray] - ) -> Union[DatetimeLikeArrayMixin, DTScalarOrNaT]: + def __getitem__(self, key): """ This getitem defers to the underlying array, which by-definition can only handle list-likes, slices, and integer scalars @@ -381,11 +383,7 @@ def view(self, dtype=None): # ExtensionArray Interface @classmethod - def _concat_same_type( - cls: Type[DatetimeLikeArrayT], - to_concat: Sequence[DatetimeLikeArrayT], - axis: int = 0, - ) -> DatetimeLikeArrayT: + def _concat_same_type(cls, to_concat, axis: int = 0): new_obj = super()._concat_same_type(to_concat, axis) obj = to_concat[0] @@ -464,7 +462,7 @@ def _validate_comparison_value(self, other): def _validate_fill_value(self, fill_value): """ If a fill_value is passed to `take` convert it to an i8 representation, - raising TypeError if this is not possible. + raising ValueError if this is not possible. Parameters ---------- @@ -476,9 +474,19 @@ def _validate_fill_value(self, fill_value): Raises ------ - TypeError + ValueError """ - return self._validate_scalar(fill_value) + msg = ( + f"'fill_value' should be a {self._scalar_type}. " + f"Got '{str(fill_value)}'." + ) + try: + return self._validate_scalar(fill_value) + except TypeError as err: + if "Cannot compare tz-naive and tz-aware" in str(err): + # tzawareness-compat + raise + raise ValueError(msg) from err def _validate_shift_value(self, fill_value): # TODO(2.0): once this deprecation is enforced, use _validate_fill_value @@ -699,59 +707,6 @@ def map(self, mapper): return Index(self).map(mapper).array - def isin(self, values) -> np.ndarray: - """ - Compute boolean array of whether each value is found in the - passed set of values. 
- - Parameters - ---------- - values : set or sequence of values - - Returns - ------- - ndarray[bool] - """ - if not hasattr(values, "dtype"): - values = np.asarray(values) - - if values.dtype.kind in ["f", "i", "u", "c"]: - # TODO: de-duplicate with equals, validate_comparison_value - return np.zeros(self.shape, dtype=bool) - - if not isinstance(values, type(self)): - inferrable = [ - "timedelta", - "timedelta64", - "datetime", - "datetime64", - "date", - "period", - ] - if values.dtype == object: - inferred = lib.infer_dtype(values, skipna=False) - if inferred not in inferrable: - if inferred == "string": - pass - - elif "mixed" in inferred: - return isin(self.astype(object), values) - else: - return np.zeros(self.shape, dtype=bool) - - try: - values = type(self)._from_sequence(values) - except ValueError: - return isin(self.astype(object), values) - - try: - self._check_compatible_with(values) - except (TypeError, ValueError): - # Includes tzawareness mismatch and IncompatibleFrequencyError - return np.zeros(self.shape, dtype=bool) - - return isin(self.asi8, values.asi8) - # ------------------------------------------------------------------ # Null Handling @@ -1009,7 +964,7 @@ def _add_timedeltalike_scalar(self, other): # adding a scalar preserves freq new_freq = self.freq - return type(self)._simple_new(new_values, dtype=self.dtype, freq=new_freq) + return type(self)(new_values, dtype=self.dtype, freq=new_freq) def _add_timedelta_arraylike(self, other): """ @@ -1613,9 +1568,6 @@ def ceil(self, freq, ambiguous="raise", nonexistent="raise"): # -------------------------------------------------------------- # Frequency Methods - def _maybe_clear_freq(self): - self._freq = None - def _with_freq(self, freq): """ Helper to get a view on the same data, with a new freq. diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index ce70f929cc79d..a05dc717f83c1 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -1,5 +1,5 @@ from datetime import datetime, time, timedelta, tzinfo -from typing import Optional, Union, cast +from typing import Optional, Union import warnings import numpy as np @@ -154,7 +154,6 @@ class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps): _scalar_type = Timestamp _recognized_scalars = (datetime, np.datetime64) _is_recognized_dtype = is_datetime64_any_dtype - _infer_matches = ("datetime", "datetime64", "date") # define my properties & methods for delegation _bool_ops = [ @@ -445,11 +444,9 @@ def _generate_range( ) if not left_closed and len(index) and index[0] == start: - # TODO: overload DatetimeLikeArrayMixin.__getitem__ - index = cast(DatetimeArray, index[1:]) + index = index[1:] if not right_closed and len(index) and index[-1] == end: - # TODO: overload DatetimeLikeArrayMixin.__getitem__ - index = cast(DatetimeArray, index[:-1]) + index = index[:-1] dtype = tz_to_dtype(tz) return cls._simple_new(index.asi8, freq=freq, dtype=dtype) @@ -477,6 +474,9 @@ def _check_compatible_with(self, other, setitem: bool = False): if not timezones.tz_compare(self.tz, other.tz): raise ValueError(f"Timezones don't match. 
'{self.tz}' != '{other.tz}'") + def _maybe_clear_freq(self): + self._freq = None + # ----------------------------------------------------------------- # Descriptive Properties diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index 4aed39d7edb92..a5ebdd8d963e2 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -120,7 +120,7 @@ def coerce_to_array( ------- tuple of (values, mask) """ - # if values is floating numpy array, preserve its dtype + # if values is floating numpy array, preserve it's dtype if dtype is None and hasattr(values, "dtype"): if is_float_dtype(values.dtype): dtype = values.dtype diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 2897c18acfb09..c9d7632e39228 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -183,7 +183,7 @@ def coerce_to_array( ------- tuple of (values, mask) """ - # if values is integer numpy array, preserve its dtype + # if values is integer numpy array, preserve it's dtype if dtype is None and hasattr(values, "dtype"): if is_integer_dtype(values.dtype): dtype = values.dtype diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index efb66c9a47a97..a2eb506c6747a 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -1,7 +1,7 @@ import operator from operator import le, lt import textwrap -from typing import Sequence, Type, TypeVar +from typing import TYPE_CHECKING, Optional, Tuple, Union, cast import numpy as np @@ -14,6 +14,7 @@ intervals_to_interval_bounds, ) from pandas._libs.missing import NA +from pandas._typing import ArrayLike, Dtype from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender @@ -21,7 +22,9 @@ from pandas.core.dtypes.common import ( is_categorical_dtype, is_datetime64_any_dtype, + is_dtype_equal, is_float_dtype, + is_integer, is_integer_dtype, is_interval_dtype, is_list_like, @@ -49,7 +52,9 @@ from pandas.core.indexes.base import ensure_index from pandas.core.ops import invalid_comparison, unpack_zerodim_and_defer -IntervalArrayT = TypeVar("IntervalArrayT", bound="IntervalArray") +if TYPE_CHECKING: + from pandas import Index + from pandas.core.arrays import DatetimeArray, TimedeltaArray _interval_shared_docs = {} @@ -175,6 +180,17 @@ def __new__( left = data._left right = data._right closed = closed or data.closed + + if dtype is None or data.dtype == dtype: + # This path will preserve id(result._combined) + # TODO: could also validate dtype before going to simple_new + combined = data._combined + if copy: + combined = combined.copy() + result = cls._simple_new(combined, closed=closed) + if verify_integrity: + result._validate() + return result else: # don't allow scalars @@ -192,83 +208,22 @@ def __new__( ) closed = closed or infer_closed - return cls._simple_new( - left, - right, - closed, - copy=copy, - dtype=dtype, - verify_integrity=verify_integrity, - ) + closed = closed or "right" + left, right = _maybe_cast_inputs(left, right, copy, dtype) + combined = _get_combined_data(left, right) + result = cls._simple_new(combined, closed=closed) + if verify_integrity: + result._validate() + return result @classmethod - def _simple_new( - cls, left, right, closed=None, copy=False, dtype=None, verify_integrity=True - ): + def _simple_new(cls, data, closed="right"): result = IntervalMixin.__new__(cls) - closed = closed or "right" - left = ensure_index(left, copy=copy) - right = ensure_index(right, copy=copy) - - if dtype is not None: 
- # GH 19262: dtype must be an IntervalDtype to override inferred - dtype = pandas_dtype(dtype) - if not is_interval_dtype(dtype): - msg = f"dtype must be an IntervalDtype, got {dtype}" - raise TypeError(msg) - elif dtype.subtype is not None: - left = left.astype(dtype.subtype) - right = right.astype(dtype.subtype) - - # coerce dtypes to match if needed - if is_float_dtype(left) and is_integer_dtype(right): - right = right.astype(left.dtype) - elif is_float_dtype(right) and is_integer_dtype(left): - left = left.astype(right.dtype) - - if type(left) != type(right): - msg = ( - f"must not have differing left [{type(left).__name__}] and " - f"right [{type(right).__name__}] types" - ) - raise ValueError(msg) - elif is_categorical_dtype(left.dtype) or is_string_dtype(left.dtype): - # GH 19016 - msg = ( - "category, object, and string subtypes are not supported " - "for IntervalArray" - ) - raise TypeError(msg) - elif isinstance(left, ABCPeriodIndex): - msg = "Period dtypes are not supported, use a PeriodIndex instead" - raise ValueError(msg) - elif isinstance(left, ABCDatetimeIndex) and str(left.tz) != str(right.tz): - msg = ( - "left and right must have the same time zone, got " - f"'{left.tz}' and '{right.tz}'" - ) - raise ValueError(msg) - - # For dt64/td64 we want DatetimeArray/TimedeltaArray instead of ndarray - from pandas.core.ops.array_ops import maybe_upcast_datetimelike_array - - left = maybe_upcast_datetimelike_array(left) - left = extract_array(left, extract_numpy=True) - right = maybe_upcast_datetimelike_array(right) - right = extract_array(right, extract_numpy=True) - - lbase = getattr(left, "_ndarray", left).base - rbase = getattr(right, "_ndarray", right).base - if lbase is not None and lbase is rbase: - # If these share data, then setitem could corrupt our IA - right = right.copy() - - result._left = left - result._right = right + result._combined = data + result._left = data[:, 0] + result._right = data[:, 1] result._closed = closed - if verify_integrity: - result._validate() return result @classmethod @@ -403,10 +358,16 @@ def from_breaks(cls, breaks, closed="right", copy=False, dtype=None): def from_arrays(cls, left, right, closed="right", copy=False, dtype=None): left = maybe_convert_platform_interval(left) right = maybe_convert_platform_interval(right) + if len(left) != len(right): + raise ValueError("left and right must have the same length") - return cls._simple_new( - left, right, closed, copy=copy, dtype=dtype, verify_integrity=True - ) + closed = closed or "right" + left, right = _maybe_cast_inputs(left, right, copy, dtype) + combined = _get_combined_data(left, right) + + result = cls._simple_new(combined, closed) + result._validate() + return result _interval_shared_docs["from_tuples"] = textwrap.dedent( """ @@ -512,19 +473,6 @@ def _validate(self): msg = "left side of interval must be <= right side" raise ValueError(msg) - def _shallow_copy(self, left, right): - """ - Return a new IntervalArray with the replacement attributes - - Parameters - ---------- - left : Index - Values to be used for the left-side of the intervals. - right : Index - Values to be used for the right-side of the intervals. 
- """ - return self._simple_new(left, right, closed=self.closed, verify_integrity=False) - # --------------------------------------------------------------------- # Descriptive @@ -552,18 +500,20 @@ def __len__(self) -> int: def __getitem__(self, key): key = check_array_indexer(self, key) - left = self._left[key] - right = self._right[key] - if not isinstance(left, (np.ndarray, ExtensionArray)): - # scalar - if is_scalar(left) and isna(left): + result = self._combined[key] + + if is_integer(key): + left, right = result[0], result[1] + if isna(left): return self._fill_value return Interval(left, right, self.closed) - if np.ndim(left) > 1: + + # TODO: need to watch out for incorrectly-reducing getitem + if np.ndim(result) > 2: # GH#30588 multi-dimensional indexer disallowed raise ValueError("multi-dimensional indexing not allowed") - return self._shallow_copy(left, right) + return type(self)._simple_new(result, closed=self.closed) def __setitem__(self, key, value): value_left, value_right = self._validate_setitem_value(value) @@ -667,24 +617,6 @@ def __lt__(self, other): def __le__(self, other): return self._cmp_method(other, operator.le) - def argsort( - self, - ascending: bool = True, - kind: str = "quicksort", - na_position: str = "last", - *args, - **kwargs, - ) -> np.ndarray: - ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs) - - if ascending and kind == "quicksort" and na_position == "last": - return np.lexsort((self.right, self.left)) - - # TODO: other cases we can use lexsort for? much more performant. - return super().argsort( - ascending=ascending, kind=kind, na_position=na_position, **kwargs - ) - def fillna(self, value=None, method=None, limit=None): """ Fill NA/NaN values using the specified method. @@ -717,11 +649,12 @@ def fillna(self, value=None, method=None, limit=None): if limit is not None: raise TypeError("limit is not supported for IntervalArray.") - value_left, value_right = self._validate_fill_value(value) + value_left, value_right = self._validate_fillna_value(value) left = self.left.fillna(value=value_left) right = self.right.fillna(value=value_right) - return self._shallow_copy(left, right) + combined = _get_combined_data(left, right) + return type(self)._simple_new(combined, closed=self.closed) def astype(self, dtype, copy=True): """ @@ -763,9 +696,11 @@ def astype(self, dtype, copy=True): f"Cannot convert {self.dtype} to {dtype}; subtypes are incompatible" ) raise TypeError(msg) from err - return self._shallow_copy(new_left, new_right) + # TODO: do astype directly on self._combined + combined = _get_combined_data(new_left, new_right) + return type(self)._simple_new(combined, closed=self.closed) elif is_categorical_dtype(dtype): - return Categorical(np.asarray(self), dtype=dtype) + return Categorical(np.asarray(self)) elif isinstance(dtype, StringDtype): return dtype.construct_array_type()._from_sequence(self, copy=False) @@ -787,9 +722,7 @@ def equals(self, other) -> bool: ) @classmethod - def _concat_same_type( - cls: Type[IntervalArrayT], to_concat: Sequence[IntervalArrayT] - ) -> IntervalArrayT: + def _concat_same_type(cls, to_concat): """ Concatenate multiple IntervalArray @@ -806,11 +739,13 @@ def _concat_same_type( raise ValueError("Intervals must all be closed on the same side.") closed = closed.pop() + # TODO: will this mess up on dt64tz? 
left = np.concatenate([interval.left for interval in to_concat]) right = np.concatenate([interval.right for interval in to_concat]) - return cls._simple_new(left, right, closed=closed, copy=False) + combined = _get_combined_data(left, right) # TODO: 1-stage concat + return cls._simple_new(combined, closed=closed) - def copy(self: IntervalArrayT) -> IntervalArrayT: + def copy(self): """ Return a copy of the array. @@ -818,11 +753,8 @@ def copy(self: IntervalArrayT) -> IntervalArrayT: ------- IntervalArray """ - left = self._left.copy() - right = self._right.copy() - closed = self.closed - # TODO: Could skip verify_integrity here. - return type(self).from_arrays(left, right, closed=closed) + combined = self._combined.copy() + return type(self)._simple_new(combined, closed=self.closed) def isna(self) -> np.ndarray: return isna(self._left) @@ -915,7 +847,8 @@ def take(self, indices, *, allow_fill=False, fill_value=None, axis=None, **kwarg self._right, indices, allow_fill=allow_fill, fill_value=fill_right ) - return self._shallow_copy(left_take, right_take) + combined = _get_combined_data(left_take, right_take) + return type(self)._simple_new(combined, closed=self.closed) def _validate_listlike(self, value): # list-like of intervals @@ -937,7 +870,7 @@ def _validate_scalar(self, value): # GH#18295 left = right = value else: - raise TypeError( + raise ValueError( "can only insert Interval objects and NA into an IntervalArray" ) return left, right @@ -945,6 +878,17 @@ def _validate_scalar(self, value): def _validate_fill_value(self, value): return self._validate_scalar(value) + def _validate_fillna_value(self, value): + # This mirrors Datetimelike._validate_fill_value + try: + return self._validate_scalar(value) + except ValueError as err: + msg = ( + "'IntervalArray.fillna' only supports filling with a " + f"scalar 'pandas.Interval'. Got a '{type(value).__name__}' instead." 
+ ) + raise TypeError(msg) from err + def _validate_setitem_value(self, value): needs_float_conversion = False @@ -1228,10 +1172,7 @@ def set_closed(self, closed): if closed not in VALID_CLOSED: msg = f"invalid option for 'closed': {closed}" raise ValueError(msg) - - return type(self)._simple_new( - left=self._left, right=self._right, closed=closed, verify_integrity=False - ) + return type(self)._simple_new(self._combined, closed=closed) _interval_shared_docs[ "is_non_overlapping_monotonic" @@ -1372,9 +1313,8 @@ def to_tuples(self, na_tuple=True): @Appender(_extension_array_shared_docs["repeat"] % _shared_docs_kwargs) def repeat(self, repeats, axis=None): nv.validate_repeat(tuple(), dict(axis=axis)) - left_repeat = self.left.repeat(repeats) - right_repeat = self.right.repeat(repeats) - return self._shallow_copy(left=left_repeat, right=right_repeat) + combined = self._combined.repeat(repeats, 0) + return type(self)._simple_new(combined, closed=self.closed) _interval_shared_docs["contains"] = textwrap.dedent( """ @@ -1457,3 +1397,92 @@ def maybe_convert_platform_interval(values): values = np.asarray(values) return maybe_convert_platform(values) + + +def _maybe_cast_inputs( + left_orig: Union["Index", ArrayLike], + right_orig: Union["Index", ArrayLike], + copy: bool, + dtype: Optional[Dtype], +) -> Tuple["Index", "Index"]: + left = ensure_index(left_orig, copy=copy) + right = ensure_index(right_orig, copy=copy) + + if dtype is not None: + # GH#19262: dtype must be an IntervalDtype to override inferred + dtype = pandas_dtype(dtype) + if not is_interval_dtype(dtype): + msg = f"dtype must be an IntervalDtype, got {dtype}" + raise TypeError(msg) + dtype = cast(IntervalDtype, dtype) + if dtype.subtype is not None: + left = left.astype(dtype.subtype) + right = right.astype(dtype.subtype) + + # coerce dtypes to match if needed + if is_float_dtype(left) and is_integer_dtype(right): + right = right.astype(left.dtype) + elif is_float_dtype(right) and is_integer_dtype(left): + left = left.astype(right.dtype) + + if type(left) != type(right): + msg = ( + f"must not have differing left [{type(left).__name__}] and " + f"right [{type(right).__name__}] types" + ) + raise ValueError(msg) + elif is_categorical_dtype(left.dtype) or is_string_dtype(left.dtype): + # GH#19016 + msg = ( + "category, object, and string subtypes are not supported " + "for IntervalArray" + ) + raise TypeError(msg) + elif isinstance(left, ABCPeriodIndex): + msg = "Period dtypes are not supported, use a PeriodIndex instead" + raise ValueError(msg) + elif isinstance(left, ABCDatetimeIndex) and not is_dtype_equal( + left.dtype, right.dtype + ): + left_arr = cast("DatetimeArray", left._data) + right_arr = cast("DatetimeArray", right._data) + msg = ( + "left and right must have the same time zone, got " + f"'{left_arr.tz}' and '{right_arr.tz}'" + ) + raise ValueError(msg) + + return left, right + + +def _get_combined_data( + left: Union["Index", ArrayLike], right: Union["Index", ArrayLike] +) -> Union[np.ndarray, "DatetimeArray", "TimedeltaArray"]: + # For dt64/td64 we want DatetimeArray/TimedeltaArray instead of ndarray + from pandas.core.ops.array_ops import maybe_upcast_datetimelike_array + + left = maybe_upcast_datetimelike_array(left) + left = extract_array(left, extract_numpy=True) + right = maybe_upcast_datetimelike_array(right) + right = extract_array(right, extract_numpy=True) + + lbase = getattr(left, "_ndarray", left).base + rbase = getattr(right, "_ndarray", right).base + if lbase is not None and lbase is rbase: + # If these 
share data, then setitem could corrupt our IA + right = right.copy() + + if isinstance(left, np.ndarray): + assert isinstance(right, np.ndarray) # for mypy + combined = np.concatenate( + [left.reshape(-1, 1), right.reshape(-1, 1)], + axis=1, + ) + else: + left = cast(Union["DatetimeArray", "TimedeltaArray"], left) + right = cast(Union["DatetimeArray", "TimedeltaArray"], right) + combined = type(left)._concat_same_type( + [left.reshape(-1, 1), right.reshape(-1, 1)], + axis=1, + ) + return combined diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index caed932cd7857..9cc4cc72e4c8e 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1,6 +1,4 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING, Any, Optional, Sequence, Tuple, Type, TypeVar, Union +from typing import TYPE_CHECKING, Optional, Tuple, Type, TypeVar import numpy as np @@ -58,7 +56,7 @@ def itemsize(self) -> int: return self.numpy_dtype.itemsize @classmethod - def construct_array_type(cls) -> Type[BaseMaskedArray]: + def construct_array_type(cls) -> Type["BaseMaskedArray"]: """ Return the array type associated with this dtype. @@ -102,9 +100,7 @@ def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): def dtype(self) -> BaseMaskedDtype: raise AbstractMethodError(self) - def __getitem__( - self, item: Union[int, slice, np.ndarray] - ) -> Union[BaseMaskedArray, Any]: + def __getitem__(self, item): if is_integer(item): if self._mask[item]: return self.dtype.na_value @@ -265,9 +261,7 @@ def nbytes(self) -> int: return self._data.nbytes + self._mask.nbytes @classmethod - def _concat_same_type( - cls: Type[BaseMaskedArrayT], to_concat: Sequence[BaseMaskedArrayT] - ) -> BaseMaskedArrayT: + def _concat_same_type(cls: Type[BaseMaskedArrayT], to_concat) -> BaseMaskedArrayT: data = np.concatenate([x._data for x in to_concat]) mask = np.concatenate([x._mask for x in to_concat]) return cls(data, mask) diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 4eb67dcd12728..0cdce1eabccc6 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -144,7 +144,7 @@ class PandasArray( # If you're wondering why pd.Series(cls) doesn't put the array in an # ExtensionBlock, search for `ABCPandasArray`. We check for - # that _typ to ensure that users don't unnecessarily use EAs inside + # that _typ to ensure that that users don't unnecessarily use EAs inside # pandas internals, which turns off things like block consolidation. 
_typ = "npy_extension" __array_priority__ = 1000 diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 50ed526cf01e9..80882acceb56a 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -124,7 +124,6 @@ class PeriodArray(PeriodMixin, dtl.DatelikeOps): _scalar_type = Period _recognized_scalars = (Period,) _is_recognized_dtype = is_period_dtype - _infer_matches = ("period",) # Names others delegate to us _other_ops: List[str] = [] diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index c591f81390388..d976526955ac2 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -4,7 +4,7 @@ from collections import abc import numbers import operator -from typing import Any, Callable, Sequence, Type, TypeVar, Union +from typing import Any, Callable, Union import warnings import numpy as np @@ -56,7 +56,6 @@ # ---------------------------------------------------------------------------- # Array -SparseArrayT = TypeVar("SparseArrayT", bound="SparseArray") _sparray_doc_kwargs = dict(klass="SparseArray") @@ -398,11 +397,8 @@ def __init__( @classmethod def _simple_new( - cls: Type[SparseArrayT], - sparse_array: np.ndarray, - sparse_index: SparseIndex, - dtype: SparseDtype, - ) -> SparseArrayT: + cls, sparse_array: np.ndarray, sparse_index: SparseIndex, dtype: SparseDtype + ) -> "SparseArray": new = object.__new__(cls) new._sparse_index = sparse_index new._sparse_values = sparse_array @@ -941,14 +937,12 @@ def searchsorted(self, v, side="left", sorter=None): v = np.asarray(v) return np.asarray(self, dtype=self.dtype.subtype).searchsorted(v, side, sorter) - def copy(self: SparseArrayT) -> SparseArrayT: + def copy(self): values = self.sp_values.copy() return self._simple_new(values, self.sp_index, self.dtype) @classmethod - def _concat_same_type( - cls: Type[SparseArrayT], to_concat: Sequence[SparseArrayT] - ) -> SparseArrayT: + def _concat_same_type(cls, to_concat): fill_value = to_concat[0].fill_value values = [] diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index e75305e55348c..3b297e7c2b13b 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -282,6 +282,10 @@ def __setitem__(self, key, value): super().__setitem__(key, value) + def fillna(self, value=None, method=None, limit=None): + # TODO: validate dtype + return super().fillna(value, method, limit) + def astype(self, dtype, copy=True): dtype = pandas_dtype(dtype) if isinstance(dtype, StringDtype): diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py deleted file mode 100644 index 184fbc050036b..0000000000000 --- a/pandas/core/arrays/string_arrow.py +++ /dev/null @@ -1,625 +0,0 @@ -from __future__ import annotations - -from distutils.version import LooseVersion -from typing import TYPE_CHECKING, Any, Sequence, Type, Union - -import numpy as np - -from pandas._libs import lib, missing as libmissing -from pandas.util._validators import validate_fillna_kwargs - -from pandas.core.dtypes.base import ExtensionDtype -from pandas.core.dtypes.dtypes import register_extension_dtype -from pandas.core.dtypes.missing import isna - -from pandas.api.types import ( - is_array_like, - is_bool_dtype, - is_integer, - is_integer_dtype, - is_scalar, -) -from pandas.core.arraylike import OpsMixin -from pandas.core.arrays.base import ExtensionArray -from pandas.core.indexers import check_array_indexer, validate_indices -from pandas.core.missing import get_fill_func - 
-try: - import pyarrow as pa -except ImportError: - pa = None -else: - # our min supported version of pyarrow, 0.15.1, does not have a compute - # module - try: - import pyarrow.compute as pc - except ImportError: - pass - else: - ARROW_CMP_FUNCS = { - "eq": pc.equal, - "ne": pc.not_equal, - "lt": pc.less, - "gt": pc.greater, - "le": pc.less_equal, - "ge": pc.greater_equal, - } - - -if TYPE_CHECKING: - from pandas import Series - - -@register_extension_dtype -class ArrowStringDtype(ExtensionDtype): - """ - Extension dtype for string data in a ``pyarrow.ChunkedArray``. - - .. versionadded:: 1.2.0 - - .. warning:: - - ArrowStringDtype is considered experimental. The implementation and - parts of the API may change without warning. - - Attributes - ---------- - None - - Methods - ------- - None - - Examples - -------- - >>> from pandas.core.arrays.string_arrow import ArrowStringDtype - >>> ArrowStringDtype() - ArrowStringDtype - """ - - name = "arrow_string" - - #: StringDtype.na_value uses pandas.NA - na_value = libmissing.NA - - @property - def type(self) -> Type[str]: - return str - - @classmethod - def construct_array_type(cls) -> Type["ArrowStringArray"]: - """ - Return the array type associated with this dtype. - - Returns - ------- - type - """ - return ArrowStringArray - - def __hash__(self) -> int: - return hash("ArrowStringDtype") - - def __repr__(self) -> str: - return "ArrowStringDtype" - - def __from_arrow__( - self, array: Union["pa.Array", "pa.ChunkedArray"] - ) -> "ArrowStringArray": - """ - Construct StringArray from pyarrow Array/ChunkedArray. - """ - return ArrowStringArray(array) - - def __eq__(self, other) -> bool: - """Check whether 'other' is equal to self. - - By default, 'other' is considered equal if - * it's a string matching 'self.name'. - * it's an instance of this type. - - Parameters - ---------- - other : Any - - Returns - ------- - bool - """ - if isinstance(other, ArrowStringDtype): - return True - elif isinstance(other, str) and other == "arrow_string": - return True - else: - return False - - -class ArrowStringArray(OpsMixin, ExtensionArray): - """ - Extension array for string data in a ``pyarrow.ChunkedArray``. - - .. versionadded:: 1.2.0 - - .. warning:: - - ArrowStringArray is considered experimental. The implementation and - parts of the API may change without warning. - - Parameters - ---------- - values : pyarrow.Array or pyarrow.ChunkedArray - The array of data. - - Attributes - ---------- - None - - Methods - ------- - None - - See Also - -------- - array - The recommended function for creating a ArrowStringArray. - Series.str - The string methods are available on Series backed by - a ArrowStringArray. - - Notes - ----- - ArrowStringArray returns a BooleanArray for comparison methods. 
- - Examples - -------- - >>> pd.array(['This is', 'some text', None, 'data.'], dtype="arrow_string") - - ['This is', 'some text', , 'data.'] - Length: 4, dtype: arrow_string - """ - - _dtype = ArrowStringDtype() - - def __init__(self, values): - self._chk_pyarrow_available() - if isinstance(values, pa.Array): - self._data = pa.chunked_array([values]) - elif isinstance(values, pa.ChunkedArray): - self._data = values - else: - raise ValueError(f"Unsupported type '{type(values)}' for ArrowStringArray") - - if not pa.types.is_string(self._data.type): - raise ValueError( - "ArrowStringArray requires a PyArrow (chunked) array of string type" - ) - - @classmethod - def _chk_pyarrow_available(cls) -> None: - # TODO: maybe update import_optional_dependency to allow a minimum - # version to be specified rather than use the global minimum - if pa is None or LooseVersion(pa.__version__) < "1.0.0": - msg = "pyarrow>=1.0.0 is required for PyArrow backed StringArray." - raise ImportError(msg) - - @classmethod - def _from_sequence(cls, scalars, dtype=None, copy=False): - cls._chk_pyarrow_available() - # convert non-na-likes to str, and nan-likes to ArrowStringDtype.na_value - scalars = lib.ensure_string_array(scalars, copy=False) - return cls(pa.array(scalars, type=pa.string(), from_pandas=True)) - - @classmethod - def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): - return cls._from_sequence(strings, dtype=dtype, copy=copy) - - @property - def dtype(self) -> ArrowStringDtype: - """ - An instance of 'ArrowStringDtype'. - """ - return self._dtype - - def __array__(self, dtype=None) -> np.ndarray: - """Correctly construct numpy arrays when passed to `np.asarray()`.""" - return self.to_numpy(dtype=dtype) - - def __arrow_array__(self, type=None): - """Convert myself to a pyarrow Array or ChunkedArray.""" - return self._data - - def to_numpy( - self, dtype=None, copy: bool = False, na_value=lib.no_default - ) -> np.ndarray: - """ - Convert to a NumPy ndarray. - """ - # TODO: copy argument is ignored - - if na_value is lib.no_default: - na_value = self._dtype.na_value - result = self._data.__array__(dtype=dtype) - result[isna(result)] = na_value - return result - - def __len__(self) -> int: - """ - Length of this array. - - Returns - ------- - length : int - """ - return len(self._data) - - @classmethod - def _from_factorized(cls, values, original): - return cls._from_sequence(values) - - @classmethod - def _concat_same_type(cls, to_concat) -> ArrowStringArray: - """ - Concatenate multiple ArrowStringArray. - - Parameters - ---------- - to_concat : sequence of ArrowStringArray - - Returns - ------- - ArrowStringArray - """ - return cls( - pa.chunked_array( - [array for ea in to_concat for array in ea._data.iterchunks()] - ) - ) - - def __getitem__(self, item: Any) -> Any: - """Select a subset of self. - - Parameters - ---------- - item : int, slice, or ndarray - * int: The position in 'self' to get. - * slice: A slice object, where 'start', 'stop', and 'step' are - integers or None - * ndarray: A 1-d boolean NumPy ndarray the same length as 'self' - - Returns - ------- - item : scalar or ExtensionArray - - Notes - ----- - For scalar ``item``, return a scalar value suitable for the array's - type. This should be an instance of ``self.dtype.type``. - For slice ``key``, return an instance of ``ExtensionArray``, even - if the slice is length 0 or 1. - For a boolean mask, return an instance of ``ExtensionArray``, filtered - to the values where ``item`` is True. 
- """ - item = check_array_indexer(self, item) - - if isinstance(item, np.ndarray): - if not len(item): - return type(self)(pa.chunked_array([], type=pa.string())) - elif is_integer_dtype(item.dtype): - return self.take(item) - elif is_bool_dtype(item.dtype): - return type(self)(self._data.filter(item)) - else: - raise IndexError( - "Only integers, slices and integer or " - "boolean arrays are valid indices." - ) - - # We are not an array indexer, so maybe e.g. a slice or integer - # indexer. We dispatch to pyarrow. - value = self._data[item] - if isinstance(value, pa.ChunkedArray): - return type(self)(value) - else: - return self._as_pandas_scalar(value) - - def _as_pandas_scalar(self, arrow_scalar: pa.Scalar): - scalar = arrow_scalar.as_py() - if scalar is None: - return self._dtype.na_value - else: - return scalar - - def fillna(self, value=None, method=None, limit=None): - """ - Fill NA/NaN values using the specified method. - - Parameters - ---------- - value : scalar, array-like - If a scalar value is passed it is used to fill all missing values. - Alternatively, an array-like 'value' can be given. It's expected - that the array-like have the same length as 'self'. - method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None - Method to use for filling holes in reindexed Series - pad / ffill: propagate last valid observation forward to next valid - backfill / bfill: use NEXT valid observation to fill gap. - limit : int, default None - If method is specified, this is the maximum number of consecutive - NaN values to forward/backward fill. In other words, if there is - a gap with more than this number of consecutive NaNs, it will only - be partially filled. If method is not specified, this is the - maximum number of entries along the entire axis where NaNs will be - filled. - - Returns - ------- - ExtensionArray - With NA/NaN filled. - """ - value, method = validate_fillna_kwargs(value, method) - - mask = self.isna() - - if is_array_like(value): - if len(value) != len(self): - raise ValueError( - f"Length of 'value' does not match. Got ({len(value)}) " - f"expected {len(self)}" - ) - value = value[mask] - - if mask.any(): - if method is not None: - func = get_fill_func(method) - new_values = func(self.to_numpy(object), limit=limit, mask=mask) - new_values = self._from_sequence(new_values) - else: - # fill with value - new_values = self.copy() - new_values[mask] = value - else: - new_values = self.copy() - return new_values - - def _reduce(self, name, skipna=True, **kwargs): - if name in ["min", "max"]: - return getattr(self, name)(skipna=skipna) - - raise TypeError(f"Cannot perform reduction '{name}' with string dtype") - - @property - def nbytes(self) -> int: - """ - The number of bytes needed to store this object in memory. - """ - return self._data.nbytes - - def isna(self) -> np.ndarray: - """ - Boolean NumPy array indicating if each value is missing. - - This should return a 1-D array the same length as 'self'. - """ - # TODO: Implement .to_numpy for ChunkedArray - return self._data.is_null().to_pandas().values - - def copy(self) -> ArrowStringArray: - """ - Return a shallow copy of the array. 
- - Returns - ------- - ArrowStringArray - """ - return type(self)(self._data) - - def _cmp_method(self, other, op): - from pandas.arrays import BooleanArray - - pc_func = ARROW_CMP_FUNCS[op.__name__] - if isinstance(other, ArrowStringArray): - result = pc_func(self._data, other._data) - elif isinstance(other, np.ndarray): - result = pc_func(self._data, other) - elif is_scalar(other): - try: - result = pc_func(self._data, pa.scalar(other)) - except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid): - mask = isna(self) | isna(other) - valid = ~mask - result = np.zeros(len(self), dtype="bool") - result[valid] = op(np.array(self)[valid], other) - return BooleanArray(result, mask) - else: - return NotImplemented - - # TODO(ARROW-9429): Add a .to_numpy() to ChunkedArray - return BooleanArray._from_sequence(result.to_pandas().values) - - def __setitem__(self, key: Union[int, np.ndarray], value: Any) -> None: - """Set one or more values inplace. - - Parameters - ---------- - key : int, ndarray, or slice - When called from, e.g. ``Series.__setitem__``, ``key`` will be - one of - - * scalar int - * ndarray of integers. - * boolean ndarray - * slice object - - value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object - value or values to be set of ``key``. - - Returns - ------- - None - """ - key = check_array_indexer(self, key) - - if is_integer(key): - if not is_scalar(value): - raise ValueError("Must pass scalars with scalar indexer") - elif isna(value): - value = None - elif not isinstance(value, str): - raise ValueError("Scalar must be NA or str") - - # Slice data and insert inbetween - new_data = [ - *self._data[0:key].chunks, - pa.array([value], type=pa.string()), - *self._data[(key + 1) :].chunks, - ] - self._data = pa.chunked_array(new_data) - else: - # Convert to integer indices and iteratively assign. - # TODO: Make a faster variant of this in Arrow upstream. - # This is probably extremely slow. - - # Convert all possible input key types to an array of integers - if is_bool_dtype(key): - # TODO(ARROW-9430): Directly support setitem(booleans) - key_array = np.argwhere(key).flatten() - elif isinstance(key, slice): - key_array = np.array(range(len(self))[key]) - else: - # TODO(ARROW-9431): Directly support setitem(integers) - key_array = np.asanyarray(key) - - if is_scalar(value): - value = np.broadcast_to(value, len(key_array)) - else: - value = np.asarray(value) - - if len(key_array) != len(value): - raise ValueError("Length of indexer and values mismatch") - - for k, v in zip(key_array, value): - self[k] = v - - def take( - self, indices: Sequence[int], allow_fill: bool = False, fill_value: Any = None - ) -> "ExtensionArray": - """ - Take elements from an array. - - Parameters - ---------- - indices : sequence of int - Indices to be taken. - allow_fill : bool, default False - How to handle negative values in `indices`. - - * False: negative values in `indices` indicate positional indices - from the right (the default). This is similar to - :func:`numpy.take`. - - * True: negative values in `indices` indicate - missing values. These values are set to `fill_value`. Any other - other negative values raise a ``ValueError``. - - fill_value : any, optional - Fill value to use for NA-indices when `allow_fill` is True. - This may be ``None``, in which case the default NA value for - the type, ``self.dtype.na_value``, is used. - - For many ExtensionArrays, there will be two representations of - `fill_value`: a user-facing "boxed" scalar, and a low-level - physical NA value. 
`fill_value` should be the user-facing version, - and the implementation should handle translating that to the - physical version for processing the take if necessary. - - Returns - ------- - ExtensionArray - - Raises - ------ - IndexError - When the indices are out of bounds for the array. - ValueError - When `indices` contains negative values other than ``-1`` - and `allow_fill` is True. - - See Also - -------- - numpy.take - api.extensions.take - - Notes - ----- - ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``, - ``iloc``, when `indices` is a sequence of values. Additionally, - it's called by :meth:`Series.reindex`, or any other method - that causes realignment, with a `fill_value`. - """ - # TODO: Remove once we got rid of the (indices < 0) check - if not is_array_like(indices): - indices_array = np.asanyarray(indices) - else: - indices_array = indices - - if len(self._data) == 0 and (indices_array >= 0).any(): - raise IndexError("cannot do a non-empty take") - if indices_array.size > 0 and indices_array.max() >= len(self._data): - raise IndexError("out of bounds value in 'indices'.") - - if allow_fill: - fill_mask = indices_array < 0 - if fill_mask.any(): - validate_indices(indices_array, len(self._data)) - # TODO(ARROW-9433): Treat negative indices as NULL - indices_array = pa.array(indices_array, mask=fill_mask) - result = self._data.take(indices_array) - if isna(fill_value): - return type(self)(result) - # TODO: ArrowNotImplementedError: Function fill_null has no - # kernel matching input types (array[string], scalar[string]) - result = type(self)(result) - result[fill_mask] = fill_value - return result - # return type(self)(pc.fill_null(result, pa.scalar(fill_value))) - else: - # Nothing to fill - return type(self)(self._data.take(indices)) - else: # allow_fill=False - # TODO(ARROW-9432): Treat negative indices as indices from the right. - if (indices_array < 0).any(): - # Don't modify in-place - indices_array = np.copy(indices_array) - indices_array[indices_array < 0] += len(self._data) - return type(self)(self._data.take(indices_array)) - - def value_counts(self, dropna: bool = True) -> Series: - """ - Return a Series containing counts of each unique value. - - Parameters - ---------- - dropna : bool, default True - Don't include counts of missing values. - - Returns - ------- - counts : Series - - See Also - -------- - Series.value_counts - """ - from pandas import Index, Series - - vc = self._data.value_counts() - - # Index cannot hold ExtensionArrays yet - index = Index(type(self)(vc.field(0)).astype(object)) - # No missings, so we can adhere to the interface and return a numpy array. - counts = np.array(vc.field(1)) - - if dropna and self._data.null_count > 0: - raise NotImplementedError("yo") - - return Series(counts, index=index).astype("Int64") diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 998117cc49d50..d9ecbc874cd59 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -104,7 +104,6 @@ class TimedeltaArray(dtl.TimelikeOps): _scalar_type = Timedelta _recognized_scalars = (timedelta, np.timedelta64, Tick) _is_recognized_dtype = is_timedelta64_dtype - _infer_matches = ("timedelta", "timedelta64") __array_priority__ = 1000 # define my properties & methods for delegation @@ -314,6 +313,9 @@ def _check_compatible_with(self, other, setitem: bool = False): # we don't have anything to validate. 
pass + def _maybe_clear_freq(self): + self._freq = None + # ---------------------------------------------------------------- # Array-Like / EA-Interface Methods diff --git a/pandas/core/base.py b/pandas/core/base.py index 5f724d9e89d05..4760b92ad5fec 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -269,14 +269,12 @@ def __getitem__(self, key): return self._gotitem(list(key), ndim=2) elif not getattr(self, "as_index", False): - # error: "SelectionMixin" has no attribute "obj" [attr-defined] - if key not in self.obj.columns: # type: ignore[attr-defined] + if key not in self.obj.columns: raise KeyError(f"Column not found: {key}") return self._gotitem(key, ndim=2) else: - # error: "SelectionMixin" has no attribute "obj" [attr-defined] - if key not in self.obj: # type: ignore[attr-defined] + if key not in self.obj: raise KeyError(f"Column not found: {key}") return self._gotitem(key, ndim=1) @@ -921,9 +919,10 @@ def _map_values(self, mapper, na_action=None): # "astype" [attr-defined] values = self.astype(object)._values # type: ignore[attr-defined] if na_action == "ignore": - map_f = lambda values, f: lib.map_infer_mask( - values, f, isna(values).view(np.uint8) - ) + + def map_f(values, f): + return lib.map_infer_mask(values, f, isna(values).view(np.uint8)) + elif na_action is None: map_f = lib.map_infer else: @@ -983,9 +982,9 @@ def value_counts( >>> index = pd.Index([3, 1, 2, 3, 4, np.nan]) >>> index.value_counts() 3.0 2 + 1.0 1 2.0 1 4.0 1 - 1.0 1 dtype: int64 With `normalize` set to `True`, returns the relative frequency by @@ -994,9 +993,9 @@ def value_counts( >>> s = pd.Series([3, 1, 2, 3, 4, np.nan]) >>> s.value_counts(normalize=True) 3.0 0.4 + 1.0 0.2 2.0 0.2 4.0 0.2 - 1.0 0.2 dtype: float64 **bins** @@ -1018,10 +1017,10 @@ def value_counts( >>> s.value_counts(dropna=False) 3.0 2 + 1.0 1 2.0 1 - NaN 1 4.0 1 - 1.0 1 + NaN 1 dtype: int64 """ result = value_counts( diff --git a/pandas/core/common.py b/pandas/core/common.py index cdcbc43055052..9b6133d2f7627 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -24,7 +24,12 @@ is_extension_array_dtype, is_integer, ) -from pandas.core.dtypes.generic import ABCExtensionArray, ABCIndexClass, ABCSeries +from pandas.core.dtypes.generic import ( + ABCExtensionArray, + ABCIndex, + ABCIndexClass, + ABCSeries, +) from pandas.core.dtypes.inference import iterable_not_string from pandas.core.dtypes.missing import isna, isnull, notnull # noqa @@ -37,13 +42,13 @@ class SettingWithCopyWarning(Warning): pass -def flatten(line): +def flatten(l): """ Flatten an arbitrarily nested sequence. Parameters ---------- - line : sequence + l : sequence The non string sequence to flatten Notes @@ -54,11 +59,11 @@ def flatten(line): ------- flattened : generator """ - for element in line: - if iterable_not_string(element): - yield from flatten(element) + for el in l: + if iterable_not_string(el): + yield from flatten(el) else: - yield element + yield el def consensus_name_attr(objs): @@ -100,7 +105,7 @@ def is_bool_indexer(key: Any) -> bool: check_array_indexer : Check that `key` is a valid array to index, and convert to an ndarray. 
""" - if isinstance(key, (ABCSeries, np.ndarray, ABCIndexClass)) or ( + if isinstance(key, (ABCSeries, np.ndarray, ABCIndex)) or ( is_array_like(key) and is_extension_array_dtype(key.dtype) ): if key.dtype == np.object_: @@ -108,9 +113,7 @@ def is_bool_indexer(key: Any) -> bool: if not lib.is_bool_array(key): na_msg = "Cannot mask with non-boolean array containing NA / NaN values" - if lib.infer_dtype(key) == "boolean" and isna(key).any(): - # Don't raise on e.g. ["A", "B", np.nan], see - # test_loc_getitem_list_of_labels_categoricalindex_with_na + if isna(key).any(): raise ValueError(na_msg) return False return True @@ -277,23 +280,20 @@ def is_null_slice(obj) -> bool: ) -def is_true_slices(line): +def is_true_slices(l): """ - Find non-trivial slices in "line": return a list of booleans with same length. + Find non-trivial slices in "l": return a list of booleans with same length. """ - return [isinstance(k, slice) and not is_null_slice(k) for k in line] + return [isinstance(k, slice) and not is_null_slice(k) for k in l] # TODO: used only once in indexing; belongs elsewhere? -def is_full_slice(obj, line) -> bool: +def is_full_slice(obj, l) -> bool: """ We have a full length slice. """ return ( - isinstance(obj, slice) - and obj.start == 0 - and obj.stop == line - and obj.step is None + isinstance(obj, slice) and obj.start == 0 and obj.stop == l and obj.step is None ) @@ -466,11 +466,8 @@ def convert_to_list_like( Convert list-like or scalar input to list-like. List, numpy and pandas array-like inputs are returned unmodified whereas others are converted to list. """ - if isinstance( - values, (list, np.ndarray, ABCIndexClass, ABCSeries, ABCExtensionArray) - ): - # np.ndarray resolving as Any gives a false positive - return values # type: ignore[return-value] + if isinstance(values, (list, np.ndarray, ABCIndex, ABCSeries, ABCExtensionArray)): + return values elif isinstance(values, abc.Iterable) and not isinstance(values, str): return list(values) diff --git a/pandas/core/computation/align.py b/pandas/core/computation/align.py index 5ad3e78a76866..8a8b0d564ea49 100644 --- a/pandas/core/computation/align.py +++ b/pandas/core/computation/align.py @@ -1,10 +1,9 @@ """ Core eval alignment algorithms. 
""" -from __future__ import annotations from functools import partial, wraps -from typing import TYPE_CHECKING, Dict, Optional, Sequence, Tuple, Type, Union +from typing import Dict, Optional, Sequence, Tuple, Type, Union import warnings import numpy as np @@ -18,16 +17,13 @@ import pandas.core.common as com from pandas.core.computation.common import result_type_many -if TYPE_CHECKING: - from pandas.core.indexes.api import Index - def _align_core_single_unary_op( term, -) -> Tuple[Union[partial, Type[FrameOrSeries]], Optional[Dict[str, Index]]]: +) -> Tuple[Union[partial, Type[FrameOrSeries]], Optional[Dict[str, int]]]: typ: Union[partial, Type[FrameOrSeries]] - axes: Optional[Dict[str, Index]] = None + axes: Optional[Dict[str, int]] = None if isinstance(term.value, np.ndarray): typ = partial(np.asanyarray, dtype=term.value.dtype) @@ -40,8 +36,8 @@ def _align_core_single_unary_op( def _zip_axes_from_type( - typ: Type[FrameOrSeries], new_axes: Sequence[Index] -) -> Dict[str, Index]: + typ: Type[FrameOrSeries], new_axes: Sequence[int] +) -> Dict[str, int]: return {name: new_axes[i] for i, name in enumerate(typ._AXIS_ORDERS)} diff --git a/pandas/core/computation/parsing.py b/pandas/core/computation/parsing.py index a1bebc92046ae..86e125b6b909b 100644 --- a/pandas/core/computation/parsing.py +++ b/pandas/core/computation/parsing.py @@ -8,8 +8,6 @@ import tokenize from typing import Iterator, Tuple -from pandas._typing import Label - # A token value Python's tokenizer probably will never use. BACKTICK_QUOTED_STRING = 100 @@ -93,7 +91,7 @@ def clean_backtick_quoted_toks(tok: Tuple[int, str]) -> Tuple[int, str]: return toknum, tokval -def clean_column_name(name: "Label") -> "Label": +def clean_column_name(name: str) -> str: """ Function to emulate the cleaning of a backtick quoted name. @@ -104,12 +102,12 @@ def clean_column_name(name: "Label") -> "Label": Parameters ---------- - name : hashable + name : str Name to be cleaned. Returns ------- - name : hashable + name : str Returns the name after tokenizing and cleaning. Notes diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index 0498d4d171c00..6ec637a8b4845 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -430,10 +430,6 @@ def visit_Subscript(self, node, **kwargs): except AttributeError: pass - if isinstance(slobj, Term): - # In py39 np.ndarray lookups with Term containing int raise - slobj = slobj.value - try: return self.const_type(value[slobj], self.env) except TypeError as err: diff --git a/pandas/core/construction.py b/pandas/core/construction.py index f9ebe3f1e185e..7901e150a7ff4 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -351,7 +351,7 @@ def array( return result -def extract_array(obj: object, extract_numpy: bool = False) -> Union[Any, ArrayLike]: +def extract_array(obj: AnyArrayLike, extract_numpy: bool = False) -> ArrayLike: """ Extract the ndarray or ExtensionArray from a Series or Index. 
@@ -399,7 +399,9 @@ def extract_array(obj: object, extract_numpy: bool = False) -> Union[Any, ArrayL if extract_numpy and isinstance(obj, ABCPandasArray): obj = obj.to_numpy() - return obj + # error: Incompatible return value type (got "Index", expected "ExtensionArray") + # error: Incompatible return value type (got "Series", expected "ExtensionArray") + return obj # type: ignore[return-value] def sanitize_array( diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index c2be81cd46b3b..8630867c64f88 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -99,8 +99,9 @@ def __eq__(self, other: Any) -> bool: By default, 'other' is considered equal if either * it's a string matching 'self.name'. - * it's an instance of this type and all of the attributes - in ``self._metadata`` are equal between `self` and `other`. + * it's an instance of this type and all of the + the attributes in ``self._metadata`` are equal between + `self` and `other`. Parameters ---------- diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 0f0e82f4ad4e2..9758eae60c262 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -385,17 +385,13 @@ def maybe_cast_to_extension_array( ExtensionArray or obj """ from pandas.core.arrays.string_ import StringArray - from pandas.core.arrays.string_arrow import ArrowStringArray assert isinstance(cls, type), f"must pass a type: {cls}" assertion_msg = f"must pass a subclass of ExtensionArray: {cls}" assert issubclass(cls, ABCExtensionArray), assertion_msg - # Everything can be converted to StringArrays, but we may not want to convert - if ( - issubclass(cls, (StringArray, ArrowStringArray)) - and lib.infer_dtype(obj) != "string" - ): + # Everything can be be converted to StringArrays, but we may not want to convert + if issubclass(cls, StringArray) and lib.infer_dtype(obj) != "string": return obj try: @@ -1200,7 +1196,7 @@ def soft_convert_objects( elif conversion_count > 1 and coerce: raise ValueError( "Only one of 'datetime', 'numeric' or " - "'timedelta' can be True when coerce=True." + "'timedelta' can be True when when coerce=True." ) if not is_object_dtype(values.dtype): diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index b4f6d587c6642..14184f044ae95 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1727,7 +1727,7 @@ def _validate_date_like_dtype(dtype) -> None: ------ TypeError : The dtype could not be casted to a date-like dtype. ValueError : The dtype is an illegal date-like dtype (e.g. the - frequency provided is too specific) + the frequency provided is too specific) """ try: typ = np.datetime_data(dtype)[0] diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index a9b0498081511..a38d9cbad0d64 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -21,11 +21,11 @@ from pandas.core.construction import array -def _get_dtype_kinds(arrays) -> Set[str]: +def _get_dtype_kinds(l) -> Set[str]: """ Parameters ---------- - arrays : list of arrays + l : list of arrays Returns ------- @@ -33,7 +33,7 @@ def _get_dtype_kinds(arrays) -> Set[str]: A set of kinds that exist in this list of arrays. 
""" typs: Set[str] = set() - for arr in arrays: + for arr in l: # Note: we use dtype.kind checks because they are much more performant # than is_foo_dtype diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 07280702cf06f..01b34187997cb 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -47,7 +47,7 @@ class PandasExtensionDtype(ExtensionDtype): type: Any kind: Any # The Any type annotations above are here only because mypy seems to have a - # problem dealing with multiple inheritance from PandasExtensionDtype + # problem dealing with with multiple inheritance from PandasExtensionDtype # and ExtensionDtype's @properties in the subclasses below. The kind and # type variables in those subclasses are explicitly typed below. subdtype = None diff --git a/pandas/core/dtypes/generic.py b/pandas/core/dtypes/generic.py index 0e5867809fe52..7d2549713c6bc 100644 --- a/pandas/core/dtypes/generic.py +++ b/pandas/core/dtypes/generic.py @@ -1,11 +1,4 @@ """ define generic base classes for pandas objects """ -from __future__ import annotations - -from typing import TYPE_CHECKING, Type, cast - -if TYPE_CHECKING: - from pandas import DataFrame, Series - from pandas.core.generic import NDFrame # define abstract base classes to enable isinstance type checking on our @@ -23,6 +16,7 @@ def _check(cls, inst) -> bool: return meta(name, tuple(), dct) +ABCIndex = create_pandas_abc_type("ABCIndex", "_typ", ("index",)) ABCInt64Index = create_pandas_abc_type("ABCInt64Index", "_typ", ("int64index",)) ABCUInt64Index = create_pandas_abc_type("ABCUInt64Index", "_typ", ("uint64index",)) ABCRangeIndex = create_pandas_abc_type("ABCRangeIndex", "_typ", ("rangeindex",)) @@ -59,17 +53,9 @@ def _check(cls, inst) -> bool: }, ) -ABCNDFrame = cast( - "Type[NDFrame]", - create_pandas_abc_type("ABCNDFrame", "_typ", ("series", "dataframe")), -) -ABCSeries = cast( - "Type[Series]", - create_pandas_abc_type("ABCSeries", "_typ", ("series",)), -) -ABCDataFrame = cast( - "Type[DataFrame]", create_pandas_abc_type("ABCDataFrame", "_typ", ("dataframe",)) -) +ABCNDFrame = create_pandas_abc_type("ABCNDFrame", "_typ", ("series", "dataframe")) +ABCSeries = create_pandas_abc_type("ABCSeries", "_typ", ("series",)) +ABCDataFrame = create_pandas_abc_type("ABCDataFrame", "_typ", ("dataframe",)) ABCCategorical = create_pandas_abc_type("ABCCategorical", "_typ", ("categorical")) ABCDatetimeArray = create_pandas_abc_type("ABCDatetimeArray", "_typ", ("datetimearray")) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c9030a0b2423a..9ce5ef2fc3cfe 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -118,7 +118,7 @@ ) from pandas.core.dtypes.missing import isna, notna -from pandas.core import algorithms, common as com, generic, nanops, ops +from pandas.core import algorithms, common as com, nanops, ops from pandas.core.accessor import CachedAccessor from pandas.core.aggregation import ( aggregate, @@ -157,9 +157,9 @@ from pandas.core.series import Series from pandas.core.sorting import get_group_index, lexsort_indexer, nargsort -from pandas.io.common import get_handle +from pandas.io.common import get_filepath_or_buffer from pandas.io.formats import console, format as fmt -from pandas.io.formats.info import BaseInfo, DataFrameInfo +from pandas.io.formats.info import DataFrameInfo import pandas.plotting if TYPE_CHECKING: @@ -205,14 +205,12 @@ The join is done on columns or indexes. If joining columns on columns, the DataFrame indexes *will be ignored*. 
Otherwise if joining indexes on indexes or indexes on a column or columns, the index will be passed on. -When performing a cross merge, no column specifications to merge on are -allowed. Parameters ----------%s right : DataFrame or named Series Object to merge with. -how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner' +how : {'left', 'right', 'outer', 'inner'}, default 'inner' Type of merge to be performed. * left: use only keys from left frame, similar to a SQL left outer join; @@ -223,11 +221,6 @@ join; sort keys lexicographically. * inner: use intersection of keys from both frames, similar to a SQL inner join; preserve the order of the left keys. - * cross: creates the cartesian product from both frames, preserves the order - of the left keys. - - .. versionadded:: 1.2.0 - on : label or list Column or index level names to join on. These must be found in both DataFrames. If `on` is None and not merging on indexes then this defaults @@ -348,44 +341,6 @@ ... ValueError: columns overlap but no suffix specified: Index(['value'], dtype='object') - ->>> df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]}) ->>> df2 = pd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]}) ->>> df1 - a b -0 foo 1 -1 bar 2 ->>> df2 - a c -0 foo 3 -1 baz 4 - ->>> df1.merge(df2, how='inner', on='a') - a b c -0 foo 1 3 - ->>> df1.merge(df2, how='left', on='a') - a b c -0 foo 1 3.0 -1 bar 2 NaN - ->>> df1 = pd.DataFrame({'left': ['foo', 'bar']}) ->>> df2 = pd.DataFrame({'right': [7, 8]}) ->>> df1 - left -0 foo -1 bar ->>> df2 - right -0 7 -1 8 - ->>> df1.merge(df2, how='cross') - left right -0 foo 7 -1 foo 8 -2 bar 7 -3 bar 8 """ @@ -479,7 +434,6 @@ class DataFrame(NDFrame, OpsMixin): _internal_names_set = {"columns", "index"} | NDFrame._internal_names_set _typ = "dataframe" - _HANDLED_TYPES = (Series, Index, ExtensionArray, np.ndarray) @property def _constructor(self) -> Type[DataFrame]: @@ -772,7 +726,7 @@ def _repr_fits_horizontal_(self, ignore_width: bool = False) -> bool: d.to_string(buf=buf) value = buf.getvalue() - repr_width = max(len(line) for line in value.split("\n")) + repr_width = max(len(l) for l in value.split("\n")) return repr_width < width @@ -2112,7 +2066,6 @@ def _from_arrays( ) return cls(mgr) - @doc(storage_options=generic._shared_docs["storage_options"]) @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") def to_stata( self, @@ -2165,7 +2118,7 @@ def to_stata( variable_labels : dict Dictionary containing columns as keys and variable labels as values. Each label must be 80 characters or smaller. - version : {{114, 117, 118, 119, None}}, default 114 + version : {114, 117, 118, 119, None}, default 114 Version to use in the output dta file. Set to None to let pandas decide between 118 or 119 formats depending on the number of columns in the frame. Version 114 can be read by Stata 10 and @@ -2194,17 +2147,23 @@ def to_stata( compression : str or dict, default 'infer' For on-the-fly compression of the output dta. If string, specifies compression mode. If dict, value at key 'method' specifies - compression mode. Compression mode must be one of {{'infer', 'gzip', - 'bz2', 'zip', 'xz', None}}. If compression mode is 'infer' and + compression mode. Compression mode must be one of {'infer', 'gzip', + 'bz2', 'zip', 'xz', None}. If compression mode is 'infer' and `fname` is path-like, then detect compression from the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no - compression). 
If dict and compression mode is one of {{'zip', - 'gzip', 'bz2'}}, or inferred as one of the above, other entries + compression). If dict and compression mode is one of {'zip', + 'gzip', 'bz2'}, or inferred as one of the above, other entries passed as additional compression options. .. versionadded:: 1.1.0 - {storage_options} + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values. .. versionadded:: 1.2.0 @@ -2227,9 +2186,9 @@ def to_stata( Examples -------- - >>> df = pd.DataFrame({{'animal': ['falcon', 'parrot', 'falcon', + >>> df = pd.DataFrame({'animal': ['falcon', 'parrot', 'falcon', ... 'parrot'], - ... 'speed': [350, 18, 361, 15]}}) + ... 'speed': [350, 18, 361, 15]}) >>> df.to_stata('animals.dta') # doctest: +SKIP """ if version not in (114, 117, 118, 119, None): @@ -2296,7 +2255,6 @@ def to_feather(self, path: FilePathOrBuffer[AnyStr], **kwargs) -> None: @doc( Series.to_markdown, klass=_shared_doc_kwargs["klass"], - storage_options=_shared_docs["storage_options"], examples="""Examples -------- >>> df = pd.DataFrame( @@ -2343,13 +2301,12 @@ def to_markdown( result = tabulate.tabulate(self, **kwargs) if buf is None: return result - - with get_handle(buf, mode, storage_options=storage_options) as handles: - assert not isinstance(handles.handle, (str, mmap.mmap)) - handles.handle.writelines(result) + ioargs = get_filepath_or_buffer(buf, mode=mode, storage_options=storage_options) + assert not isinstance(ioargs.filepath_or_buffer, (str, mmap.mmap)) + ioargs.filepath_or_buffer.writelines(result) + ioargs.close() return None - @doc(storage_options=generic._shared_docs["storage_options"]) @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") def to_parquet( self, @@ -2383,12 +2340,12 @@ def to_parquet( Previously this was "fname" - engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto' + engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto' Parquet library to use. If 'auto', then the option ``io.parquet.engine`` is used. The default ``io.parquet.engine`` behavior is to try 'pyarrow', falling back to 'fastparquet' if 'pyarrow' is unavailable. - compression : {{'snappy', 'gzip', 'brotli', None}}, default 'snappy' + compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy' Name of the compression to use. Use ``None`` for no compression. index : bool, default None If ``True``, include the dataframe's index(es) in the file output. @@ -2408,7 +2365,13 @@ def to_parquet( .. versionadded:: 0.24.0 - {storage_options} + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values. .. versionadded:: 1.2.0 @@ -2435,7 +2398,7 @@ def to_parquet( Examples -------- - >>> df = pd.DataFrame(data={{'col1': [1, 2], 'col2': [3, 4]}}) + >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [3, 4]}) >>> df.to_parquet('df.parquet.gzip', ... 
compression='gzip') # doctest: +SKIP >>> pd.read_parquet('df.parquet.gzip') # doctest: +SKIP @@ -2569,28 +2532,16 @@ def to_html( @Substitution( klass="DataFrame", type_sub=" and columns", - max_cols_sub=dedent( - """\ - max_cols : int, optional + max_cols_sub=( + """max_cols : int, optional When to switch from the verbose to the truncated output. If the DataFrame has more than `max_cols` columns, the truncated output is used. By default, the setting in - ``pandas.options.display.max_info_columns`` is used.""" - ), - show_counts_sub=dedent( - """\ - show_counts : bool, optional - Whether to show the non-null counts. By default, this is shown - only if the DataFrame is smaller than - ``pandas.options.display.max_info_rows`` and - ``pandas.options.display.max_info_columns``. A value of True always - shows the counts, and False never shows the counts. - null_counts : bool, optional - .. deprecated:: 1.2.0 - Use show_counts instead.""" + ``pandas.options.display.max_info_columns`` is used. + """ ), - examples_sub=dedent( - """\ + examples_sub=( + """ >>> int_values = [1, 2, 3, 4, 5] >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon'] >>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0] @@ -2673,42 +2624,31 @@ def to_html( dtypes: object(3) memory usage: 165.9 MB""" ), - see_also_sub=dedent( - """\ + see_also_sub=( + """ DataFrame.describe: Generate descriptive statistics of DataFrame columns. DataFrame.memory_usage: Memory usage of DataFrame columns.""" ), - version_added_sub="", ) - @doc(BaseInfo.render) + @doc(DataFrameInfo.to_buffer) def info( self, verbose: Optional[bool] = None, buf: Optional[IO[str]] = None, max_cols: Optional[int] = None, memory_usage: Optional[Union[bool, str]] = None, - show_counts: Optional[bool] = None, null_counts: Optional[bool] = None, ) -> None: - if null_counts is not None: - if show_counts is not None: - raise ValueError("null_counts used with show_counts. Use show_counts.") - warnings.warn( - "null_counts is deprecated. Use show_counts instead", - FutureWarning, - stacklevel=2, - ) - show_counts = null_counts info = DataFrameInfo( data=self, memory_usage=memory_usage, ) - info.render( + info.to_buffer( buf=buf, max_cols=max_cols, verbose=verbose, - show_counts=show_counts, + show_counts=null_counts, ) def memory_usage(self, index=True, deep=False) -> Series: @@ -2793,7 +2733,7 @@ def memory_usage(self, index=True, deep=False) -> Series: many repeated values. >>> df['object'].astype('category').memory_usage(deep=True) - 5244 + 5216 """ result = self._constructor_sliced( [c.memory_usage(index=False, deep=deep) for col, c in self.items()], @@ -2994,7 +2934,7 @@ def __getitem__(self, key): if is_hashable(key): # shortcut if the key is in columns if self.columns.is_unique and key in self.columns: - if isinstance(self.columns, MultiIndex): + if self.columns.nlevels > 1: return self._getitem_multilevel(key) return self._get_item_cache(key) @@ -4630,7 +4570,7 @@ def set_index( append : bool, default False Whether to append columns to existing index. inplace : bool, default False - If True, modifies the DataFrame in place (do not create a new object). + Modify the DataFrame in place (do not create a new object). verify_integrity : bool, default False Check the new index for duplicates. Otherwise defer the check until necessary. 
Setting to False will improve the performance of this @@ -6031,16 +5971,13 @@ def _dispatch_frame_op(self, right, func, axis: Optional[int] = None): # maybe_align_as_frame ensures we do not have an ndarray here assert not isinstance(right, np.ndarray) - arrays = [ - array_op(_left, _right) - for _left, _right in zip(self._iter_column_arrays(), right) - ] + arrays = [array_op(l, r) for l, r in zip(self._iter_column_arrays(), right)] elif isinstance(right, Series): assert right.index.equals(self.index) # Handle other cases later right = right._values - arrays = [array_op(left, right) for left in self._iter_column_arrays()] + arrays = [array_op(l, right) for l in self._iter_column_arrays()] else: # Remaining cases have less-obvious dispatch rules @@ -6532,7 +6469,7 @@ def update( 1 b e 2 c f - For Series, its name attribute must be set. + For Series, it's name attribute must be set. >>> df = pd.DataFrame({'A': ['a', 'b', 'c'], ... 'B': ['x', 'y', 'z']}) @@ -8128,15 +8065,6 @@ def _join_compat( other = DataFrame({other.name: other}) if isinstance(other, DataFrame): - if how == "cross": - return merge( - self, - other, - how=how, - on=on, - suffixes=(lsuffix, rsuffix), - sort=sort, - ) return merge( self, other, @@ -8837,7 +8765,7 @@ def _get_data() -> DataFrame: data = self._get_bool_data() return data - if numeric_only is not None or axis == 0: + if numeric_only is not None: # For numeric_only non-None and axis non-None, we know # which blocks to use and no try/except is needed. # For numeric_only=None only the case with axis==0 and no object @@ -8862,14 +8790,36 @@ def _get_data() -> DataFrame: # GH#35865 careful to cast explicitly to object nvs = coerce_to_dtypes(out.values, df.dtypes.iloc[np.sort(indexer)]) out[:] = np.array(nvs, dtype=object) - if axis == 0 and len(self) == 0 and name in ["sum", "prod"]: - # Even if we are object dtype, follow numpy and return - # float64, see test_apply_funcs_over_empty - out = out.astype(np.float64) return out assert numeric_only is None + if not self._is_homogeneous_type or self._mgr.any_extension_types: + # try to avoid self.values call + + if filter_type is None and axis == 0: + # operate column-wise + + # numeric_only must be None here, as other cases caught above + + # this can end up with a non-reduction + # but not always. if the types are mixed + # with datelike then need to make sure a series + + # we only end up here if we have not specified + # numeric_only and yet we have tried a + # column-by-column reduction, where we have mixed type. 
+ # So let's just do what we can + from pandas.core.apply import frame_apply + + opa = frame_apply( + self, func=func, result_type="expand", ignore_failures=True + ) + result = opa.get_result() + if result.ndim == self.ndim: + result = result.iloc[0].rename(None) + return result + data = self values = data.values diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e12053b71a815..222cf0af5869b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -70,7 +70,6 @@ is_datetime64_any_dtype, is_datetime64tz_dtype, is_dict_like, - is_dtype_equal, is_extension_array_dtype, is_float, is_list_like, @@ -87,7 +86,7 @@ from pandas.core.dtypes.missing import isna, notna import pandas as pd -from pandas.core import arraylike, indexing, missing, nanops +from pandas.core import indexing, missing, nanops import pandas.core.algorithms as algos from pandas.core.base import PandasObject, SelectionMixin import pandas.core.common as com @@ -512,7 +511,7 @@ def _get_axis_resolvers(self, axis: str) -> Dict[str, Union[Series, MultiIndex]] return d @final - def _get_index_resolvers(self) -> Dict[Label, Union[Series, MultiIndex]]: + def _get_index_resolvers(self) -> Dict[str, Union[Series, MultiIndex]]: from pandas.core.computation.parsing import clean_column_name d: Dict[str, Union[Series, MultiIndex]] = {} @@ -522,7 +521,7 @@ def _get_index_resolvers(self) -> Dict[Label, Union[Series, MultiIndex]]: return {clean_column_name(k): v for k, v in d.items() if not isinstance(k, int)} @final - def _get_cleaned_column_resolvers(self) -> Dict[Label, Series]: + def _get_cleaned_column_resolvers(self) -> Dict[str, ABCSeries]: """ Return the special character free column resolvers of a dataframe. @@ -533,6 +532,7 @@ def _get_cleaned_column_resolvers(self) -> Dict[Label, Series]: from pandas.core.computation.parsing import clean_column_name if isinstance(self, ABCSeries): + self = cast("Series", self) return {clean_column_name(self.name): self} return { @@ -1114,7 +1114,7 @@ def rename_axis(self, mapper=lib.no_default, **kwargs): In this case, the parameter ``copy`` is ignored. The second calling convention will modify the names of the - corresponding index if mapper is a list or a scalar. + the corresponding index if mapper is a list or a scalar. However, if mapper is dict-like or a function, it will use the deprecated behavior of modifying the axis *labels*. @@ -1927,11 +1927,6 @@ def __array_wrap__( self, method="__array_wrap__" ) - def __array_ufunc__( - self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any - ): - return arraylike.array_ufunc(self, ufunc, method, *inputs, **kwargs) - # ideally we would define this to avoid the getattr checks, but # is slower # @property @@ -2029,13 +2024,13 @@ def _repr_data_resource_(self): # I/O Methods @final - @doc(klass="object", storage_options=_shared_docs["storage_options"]) + @doc(klass="object") def to_excel( self, excel_writer, - sheet_name: str = "Sheet1", - na_rep: str = "", - float_format: Optional[str] = None, + sheet_name="Sheet1", + na_rep="", + float_format=None, columns=None, header=True, index=True, @@ -2048,7 +2043,6 @@ def to_excel( inf_rep="inf", verbose=True, freeze_panes=None, - storage_options: StorageOptions = None, ) -> None: """ Write {klass} to an Excel sheet. @@ -2065,7 +2059,7 @@ def to_excel( Parameters ---------- - excel_writer : path-like, file-like, or ExcelWriter object + excel_writer : str or ExcelWriter object File path or existing ExcelWriter. 
sheet_name : str, default 'Sheet1' Name of sheet which will contain DataFrame. @@ -2106,9 +2100,6 @@ def to_excel( freeze_panes : tuple of int (length 2), optional Specifies the one-based bottommost row and rightmost column that is to be frozen. - {storage_options} - - .. versionadded:: 1.2.0 See Also -------- @@ -2183,11 +2174,9 @@ def to_excel( startcol=startcol, freeze_panes=freeze_panes, engine=engine, - storage_options=storage_options, ) @final - @doc(storage_options=_shared_docs["storage_options"]) def to_json( self, path_or_buf: Optional[FilePathOrBuffer] = None, @@ -2220,27 +2209,27 @@ def to_json( * Series: - default is 'index' - - allowed values are: {{'split', 'records', 'index', 'table'}}. + - allowed values are: {'split', 'records', 'index', 'table'}. * DataFrame: - default is 'columns' - - allowed values are: {{'split', 'records', 'index', 'columns', - 'values', 'table'}}. + - allowed values are: {'split', 'records', 'index', 'columns', + 'values', 'table'}. * The format of the JSON string: - - 'split' : dict like {{'index' -> [index], 'columns' -> [columns], - 'data' -> [values]}} - - 'records' : list like [{{column -> value}}, ... , {{column -> value}}] - - 'index' : dict like {{index -> {{column -> value}}}} - - 'columns' : dict like {{column -> {{index -> value}}}} + - 'split' : dict like {'index' -> [index], 'columns' -> [columns], + 'data' -> [values]} + - 'records' : list like [{column -> value}, ... , {column -> value}] + - 'index' : dict like {index -> {column -> value}} + - 'columns' : dict like {column -> {index -> value}} - 'values' : just the values array - - 'table' : dict like {{'schema': {{schema}}, 'data': {{data}}}} + - 'table' : dict like {'schema': {schema}, 'data': {data}} Describing the data, where data component is like ``orient='records'``. - date_format : {{None, 'epoch', 'iso'}} + date_format : {None, 'epoch', 'iso'} Type of date conversion. 'epoch' = epoch milliseconds, 'iso' = ISO8601. The default depends on the `orient`. For ``orient='table'``, the default is 'iso'. For all other orients, @@ -2263,7 +2252,7 @@ def to_json( throw ValueError if incorrect 'orient' since others are not list like. - compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}} + compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None} A string representing the compression to use in the output file, only used when the first argument is a filename. By default, the @@ -2280,7 +2269,13 @@ def to_json( .. versionadded:: 1.0.0 - {storage_options} + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values. .. versionadded:: 1.2.0 @@ -2317,7 +2312,7 @@ def to_json( >>> result = df.to_json(orient="split") >>> parsed = json.loads(result) >>> json.dumps(parsed, indent=4) # doctest: +SKIP - {{ + { "columns": [ "col 1", "col 2" @@ -2336,7 +2331,7 @@ def to_json( "d" ] ] - }} + } Encoding/decoding a Dataframe using ``'records'`` formatted JSON. Note that index labels are not preserved with this encoding. 
@@ -2345,14 +2340,14 @@ def to_json( >>> parsed = json.loads(result) >>> json.dumps(parsed, indent=4) # doctest: +SKIP [ - {{ + { "col 1": "a", "col 2": "b" - }}, - {{ + }, + { "col 1": "c", "col 2": "d" - }} + } ] Encoding/decoding a Dataframe using ``'index'`` formatted JSON: @@ -2360,32 +2355,32 @@ def to_json( >>> result = df.to_json(orient="index") >>> parsed = json.loads(result) >>> json.dumps(parsed, indent=4) # doctest: +SKIP - {{ - "row 1": {{ + { + "row 1": { "col 1": "a", "col 2": "b" - }}, - "row 2": {{ + }, + "row 2": { "col 1": "c", "col 2": "d" - }} - }} + } + } Encoding/decoding a Dataframe using ``'columns'`` formatted JSON: >>> result = df.to_json(orient="columns") >>> parsed = json.loads(result) >>> json.dumps(parsed, indent=4) # doctest: +SKIP - {{ - "col 1": {{ + { + "col 1": { "row 1": "a", "row 2": "c" - }}, - "col 2": {{ + }, + "col 2": { "row 1": "b", "row 2": "d" - }} - }} + } + } Encoding/decoding a Dataframe using ``'values'`` formatted JSON: @@ -2408,40 +2403,40 @@ def to_json( >>> result = df.to_json(orient="table") >>> parsed = json.loads(result) >>> json.dumps(parsed, indent=4) # doctest: +SKIP - {{ - "schema": {{ + { + "schema": { "fields": [ - {{ + { "name": "index", "type": "string" - }}, - {{ + }, + { "name": "col 1", "type": "string" - }}, - {{ + }, + { "name": "col 2", "type": "string" - }} + } ], "primaryKey": [ "index" ], "pandas_version": "0.20.0" - }}, + }, "data": [ - {{ + { "index": "row 1", "col 1": "a", "col 2": "b" - }}, - {{ + }, + { "index": "row 2", "col 1": "c", "col 2": "d" - }} + } ] - }} + } """ from pandas.io import json @@ -2722,7 +2717,7 @@ def to_sql( >>> engine.execute("SELECT * FROM users").fetchall() [(0, 'User 1'), (1, 'User 2'), (2, 'User 3')] - An `sqlalchemy.engine.Connection` can also be passed to `con`: + An `sqlalchemy.engine.Connection` can also be passed to to `con`: >>> with engine.begin() as connection: ... df1 = pd.DataFrame({'name' : ['User 4', 'User 5']}) @@ -2780,7 +2775,6 @@ def to_sql( ) @final - @doc(storage_options=_shared_docs["storage_options"]) def to_pickle( self, path, @@ -2795,7 +2789,7 @@ def to_pickle( ---------- path : str File path where the pickled object will be stored. - compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, \ + compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, \ default 'infer' A string representing the compression to use in the output file. By default, infers from the file extension in specified path. @@ -2807,7 +2801,13 @@ def to_pickle( .. [1] https://docs.python.org/3/library/pickle.html. - {storage_options} + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values. .. versionadded:: 1.2.0 @@ -2820,7 +2820,7 @@ def to_pickle( Examples -------- - >>> original_df = pd.DataFrame({{"foo": range(5), "bar": range(5, 10)}}) + >>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)}) >>> original_df foo bar 0 0 5 @@ -3185,7 +3185,6 @@ def to_latex( ) @final - @doc(storage_options=_shared_docs["storage_options"]) def to_csv( self, path_or_buf: Optional[FilePathOrBuffer] = None, @@ -3222,7 +3221,7 @@ def to_csv( File path or object, if None is provided the result is returned as a string. 
If a non-binary file object is passed, it should be opened with `newline=''`, disabling universal newlines. If a binary - file object is passed, `mode` might need to contain a `'b'`. + file object is passed, `mode` needs to contain a `'b'`. .. versionchanged:: 0.24.0 @@ -3265,11 +3264,11 @@ def to_csv( compression : str or dict, default 'infer' If str, represents compression mode. If dict, value at 'method' is the compression mode. Compression mode may be any of the following - possible values: {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}. If + possible values: {'infer', 'gzip', 'bz2', 'zip', 'xz', None}. If compression mode is 'infer' and `path_or_buf` is path-like, then detect compression mode from the following extensions: '.gz', '.bz2', '.zip' or '.xz'. (otherwise no compression). If dict given - and mode is one of {{'zip', 'gzip', 'bz2'}}, or inferred as + and mode is one of {'zip', 'gzip', 'bz2'}, or inferred as one of the above, other entries passed as additional compression options. @@ -3326,7 +3325,13 @@ def to_csv( .. versionadded:: 1.1.0 - {storage_options} + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values. .. versionadded:: 1.2.0 @@ -3343,9 +3348,9 @@ def to_csv( Examples -------- - >>> df = pd.DataFrame({{'name': ['Raphael', 'Donatello'], + >>> df = pd.DataFrame({'name': ['Raphael', 'Donatello'], ... 'mask': ['red', 'purple'], - ... 'weapon': ['sai', 'bo staff']}}) + ... 
'weapon': ['sai', 'bo staff']}) >>> df.to_csv(index=False) 'name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n' @@ -3703,23 +3708,18 @@ class animal locomotion return result if axis == 1: - if drop_level: - return self[key] - index = self.columns - else: - index = self.index - - self._consolidate_inplace() + return self[key] + index = self.index if isinstance(index, MultiIndex): try: - loc, new_index = index._get_loc_level( + loc, new_index = self.index._get_loc_level( key, level=0, drop_level=drop_level ) except TypeError as e: raise TypeError(f"Expected label or tuple of labels, got {key}") from e else: - loc = index.get_loc(key) + loc = self.index.get_loc(key) if isinstance(loc, np.ndarray): if loc.dtype == np.bool_: @@ -3729,9 +3729,9 @@ class animal locomotion return self._take_with_is_copy(loc, axis=axis) if not is_scalar(loc): - new_index = index[loc] + new_index = self.index[loc] - if is_scalar(loc) and axis == 0: + if is_scalar(loc): # In this case loc should be an integer if self.ndim == 1: # if we encounter an array-like and we only have 1 dim @@ -3747,10 +3747,7 @@ class animal locomotion name=self.index[loc], dtype=new_values.dtype, ) - elif is_scalar(loc): - result = self.iloc[:, slice(loc, loc + 1)] - elif axis == 1: - result = self.iloc[:, loc] + else: result = self.iloc[loc] result.index = new_index @@ -3774,7 +3771,7 @@ def _get_item_cache(self, item): loc = self.columns.get_loc(item) values = self._mgr.iget(loc) - res = self._box_col_values(values, loc).__finalize__(self) + res = self._box_col_values(values, loc) cache[item] = res res._set_as_cached(item, self) @@ -5490,7 +5487,7 @@ def __setattr__(self, name: str, value) -> None: def _dir_additions(self) -> Set[str]: """ add the string-like attributes from the info_axis. - If info_axis is a MultiIndex, its first level values are used. + If info_axis is a MultiIndex, it's first level values are used. """ additions = super()._dir_additions() if self._info_axis._can_hold_strings: @@ -6329,8 +6326,6 @@ def fillna( inplace = validate_bool_kwarg(inplace, "inplace") value, method = validate_fillna_kwargs(value, method) - self._consolidate_inplace() - # set the default here, so functions examining the signaure # can detect if something was set (e.g. in groupby) (GH9221) if axis is None: @@ -6753,8 +6748,6 @@ def replace( if not is_bool(regex) and to_replace is not None: raise ValueError("'to_replace' must be 'None' if 'regex' is not a bool") - self._consolidate_inplace() - if value is None: # passing a single value that is scalar like # when value is None (GH5319), for compat @@ -9054,6 +9047,7 @@ def _where( cond = -cond if inplace else cond # try to align with other + try_quick = True if isinstance(other, NDFrame): # align with me @@ -9092,11 +9086,12 @@ def _where( # match True cond to other elif len(cond[icond]) == len(other): - # try to not change dtype at first - new_other = np.asarray(self) - new_other = new_other.copy() - new_other[icond] = other - other = new_other + # try to not change dtype at first (if try_quick) + if try_quick: + new_other = np.asarray(self) + new_other = new_other.copy() + new_other[icond] = other + other = new_other else: raise ValueError( @@ -11318,11 +11313,7 @@ def _inplace_method(self, other, op): """ result = op(self, other) - if ( - self.ndim == 1 - and result._indexed_same(self) - and is_dtype_equal(result.dtype, self.dtype) - ): + if self.ndim == 1 and result._indexed_same(self) and result.dtype == self.dtype: # GH#36498 this inplace op can _actually_ be inplace. 
self._values[:] = result._values return self @@ -11886,7 +11877,7 @@ def _doc_parms(cls): _any_desc = """\ Return whether any element is True, potentially over an axis. -Returns False unless there is at least one element within a series or +Returns False unless there at least one element within a series or along a Dataframe axis that is True or equivalent (e.g. non-zero or non-empty).""" diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index 7dc0db35bf8fe..f205226c03a53 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -192,7 +192,6 @@ def _gotitem(self, key, ndim, subset=None): "describe", "dtypes", "expanding", - "ewm", "filter", "get_group", "groups", diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 244c47cd1f1ea..6f86819303537 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -262,7 +262,7 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) return self._python_agg_general(func, *args, **kwargs) except (ValueError, KeyError): # TODO: KeyError is raised in _python_agg_general, - # see test_groupby.test_basic + # see see test_groupby.test_basic result = self._aggregate_named(func, *args, **kwargs) index = Index(sorted(result), name=self.grouper.names[0]) @@ -1390,7 +1390,8 @@ def _transform_fast(self, result: DataFrame) -> DataFrame: """ obj = self._obj_with_exclusions - # for each col, reshape to size of original frame by take operation + # for each col, reshape to to size of original frame + # by take operation ids, _, ngroup = self.grouper.group_info result = result.reindex(self.grouper.result_index, copy=False) output = [ @@ -1674,16 +1675,11 @@ def _wrap_transformed_output( DataFrame """ indexed_output = {key.position: val for key, val in output.items()} - result = self.obj._constructor(indexed_output) - - if self.axis == 1: - result = result.T - result.columns = self.obj.columns - else: - columns = Index(key.label for key in output) - columns.name = self.obj.columns.name - result.columns = columns + columns = Index(key.label for key in output) + columns.name = self.obj.columns.name + result = self.obj._constructor(indexed_output) + result.columns = columns result.index = self.obj.index return result diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index ae3612c99d5cd..32023576b0a91 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1600,8 +1600,10 @@ def sem(self, ddof: int = 1): cols = result.columns.get_indexer_for( result.columns.difference(self.exclusions).unique() ) - result.iloc[:, cols] = result.iloc[:, cols] / np.sqrt( - self.count().iloc[:, cols] + # TODO(GH-22046) - setting with iloc broken if labels are not unique + # .values to remove labels + result.iloc[:, cols] = ( + result.iloc[:, cols].values / np.sqrt(self.count().iloc[:, cols]).values ) return result @@ -1669,10 +1671,10 @@ def first(self, numeric_only: bool = False, min_count: int = -1): def first_compat(obj: FrameOrSeries, axis: int = 0): def first(x: Series): """Helper function for first item that isn't NA.""" - arr = x.array[notna(x.array)] - if not len(arr): + x = x.array[notna(x.array)] + if len(x) == 0: return np.nan - return arr[0] + return x[0] if isinstance(obj, DataFrame): return obj.apply(first, axis=axis) @@ -1693,10 +1695,10 @@ def last(self, numeric_only: bool = False, min_count: int = -1): def last_compat(obj: FrameOrSeries, axis: int = 0): def last(x: Series): """Helper function for last 
item that isn't NA.""" - arr = x.array[notna(x.array)] - if not len(arr): + x = x.array[notna(x.array)] + if len(x) == 0: return np.nan - return arr[-1] + return x[-1] if isinstance(obj, DataFrame): return obj.apply(last, axis=axis) @@ -1857,16 +1859,6 @@ def expanding(self, *args, **kwargs): return ExpandingGroupby(self, *args, **kwargs) - @Substitution(name="groupby") - @Appender(_common_see_also) - def ewm(self, *args, **kwargs): - """ - Return an ewm grouper, providing ewm functionality per group. - """ - from pandas.core.window import ExponentialMovingWindowGroupby - - return ExponentialMovingWindowGroupby(self, *args, **kwargs) - def _fill(self, direction, limit=None): """ Shared function for `pad` and `backfill` to call Cython method. @@ -2373,7 +2365,7 @@ def cumcount(self, ascending: bool = True): dtype: int64 """ with group_selection_context(self): - index = self._selected_obj._get_axis(self.axis) + index = self._selected_obj.index cumcounts = self._cumcount_array(ascending=ascending) return self._obj_1d_constructor(cumcounts, index) @@ -2714,8 +2706,8 @@ def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None, axis=0 fill_method = "pad" limit = 0 filled = getattr(self, fill_method)(limit=limit) - fill_grp = filled.groupby(self.grouper.codes, axis=self.axis) - shifted = fill_grp.shift(periods=periods, freq=freq, axis=self.axis) + fill_grp = filled.groupby(self.grouper.codes) + shifted = fill_grp.shift(periods=periods, freq=freq) return (filled / shifted) - 1 @Substitution(name="groupby") @@ -2750,10 +2742,7 @@ def head(self, n=5): """ self._reset_group_selection() mask = self._cumcount_array() < n - if self.axis == 0: - return self._selected_obj[mask] - else: - return self._selected_obj.iloc[:, mask] + return self._selected_obj[mask] @Substitution(name="groupby") @Substitution(see_also=_common_see_also) @@ -2787,10 +2776,7 @@ def tail(self, n=5): """ self._reset_group_selection() mask = self._cumcount_array(ascending=False) < n - if self.axis == 0: - return self._selected_obj[mask] - else: - return self._selected_obj.iloc[:, mask] + return self._selected_obj[mask] def _reindex_output( self, output: OutputFrameOrSeries, fill_value: Scalar = np.NaN diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 50c4cc53a12bb..438030008bb4d 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -148,7 +148,7 @@ def _get_splitter(self, data: FrameOrSeries, axis: int = 0) -> "DataSplitter": ------- Generator yielding subsetted objects - __finalize__ has not been called for the subsetted objects returned. + __finalize__ has not been called for the the subsetted objects returned. """ comp_ids, _, ngroups = self.group_info return get_splitter(data, comp_ids, ngroups, axis=axis) @@ -603,7 +603,8 @@ def _aggregate( ): if agg_func is libgroupby.group_nth: # different signature from the others - agg_func(result, counts, values, comp_ids, min_count, rank=1) + # TODO: should we be using min_count instead of hard-coding it? 
+ agg_func(result, counts, values, comp_ids, rank=1, min_count=-1) else: agg_func(result, counts, values, comp_ids, min_count) diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py index b6713bc760c5e..e48a42599a2a0 100644 --- a/pandas/core/indexers.py +++ b/pandas/core/indexers.py @@ -105,7 +105,7 @@ def is_empty_indexer(indexer, arr_value: np.ndarray) -> bool: return True if arr_value.ndim == 1: if not isinstance(indexer, tuple): - indexer = (indexer,) + indexer = tuple([indexer]) return any(isinstance(idx, np.ndarray) and len(idx) == 0 for idx in indexer) return False diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index c49f3f9457161..cb5641a74e60b 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -415,11 +415,6 @@ def asi8(self): ndarray An ndarray with int64 dtype. """ - warnings.warn( - "Index.asi8 is deprecated and will be removed in a future version", - FutureWarning, - stacklevel=2, - ) return None @classmethod @@ -1486,7 +1481,7 @@ def _get_level_number(self, level) -> int: def sortlevel(self, level=None, ascending=True, sort_remaining=None): """ - For internal compatibility with the Index API. + For internal compatibility with with the Index API. Sort the Index. This is for compat with MultiIndex @@ -1575,33 +1570,6 @@ def droplevel(self, level=0): Returns ------- Index or MultiIndex - - Examples - -------- - >>> mi = pd.MultiIndex.from_arrays( - ... [[1, 2], [3, 4], [5, 6]], names=['x', 'y', 'z']) - >>> mi - MultiIndex([(1, 3, 5), - (2, 4, 6)], - names=['x', 'y', 'z']) - - >>> mi.droplevel() - MultiIndex([(3, 5), - (4, 6)], - names=['y', 'z']) - - >>> mi.droplevel(2) - MultiIndex([(1, 3), - (2, 4)], - names=['x', 'y']) - - >>> mi.droplevel('z') - MultiIndex([(1, 3), - (2, 4)], - names=['x', 'y']) - - >>> mi.droplevel(['x', 'y']) - Int64Index([5, 6], dtype='int64', name='z') """ if not isinstance(level, (tuple, list)): level = [level] @@ -2517,10 +2485,12 @@ def _get_unique_index(self, dropna: bool = False): else: values = self._values - if dropna and not isinstance(self, ABCMultiIndex): - # isna not defined for MultiIndex - if self.hasnans: - values = values[~isna(values)] + if dropna: + try: + if self.hasnans: + values = values[~isna(values)] + except NotImplementedError: + pass return self._shallow_copy(values) @@ -2764,7 +2734,7 @@ def _union(self, other, sort): stacklevel=3, ) - return result + return self._shallow_copy(result) @final def _wrap_setop_result(self, other, result): @@ -2772,8 +2742,6 @@ def _wrap_setop_result(self, other, result): result, np.ndarray ): result = type(self._data)._simple_new(result, dtype=self.dtype) - elif is_categorical_dtype(self.dtype) and isinstance(result, np.ndarray): - result = Categorical(result, dtype=self.dtype) name = get_op_result_name(self, other) if isinstance(result, Index): @@ -2830,13 +2798,6 @@ def intersection(self, other, sort=False): other = other.astype("O") return this.intersection(other, sort=sort) - result = self._intersection(other, sort=sort) - return self._wrap_setop_result(other, result) - - def _intersection(self, other, sort=False): - """ - intersection specialized to the case with matching dtypes. 
- """ # TODO(EA): setops-refactor, clean all this up lvals = self._values rvals = other._values @@ -2847,7 +2808,7 @@ def _intersection(self, other, sort=False): except TypeError: pass else: - return result + return self._wrap_setop_result(other, result) try: indexer = Index(rvals).get_indexer(lvals) @@ -2863,7 +2824,7 @@ def _intersection(self, other, sort=False): if sort is None: result = algos.safe_sort(result) - return result + return self._wrap_setop_result(other, result) def difference(self, other, sort=None): """ @@ -3202,7 +3163,7 @@ def _get_fill_indexer( indexer = engine_method(target_values, limit) else: indexer = self._get_fill_indexer_searchsorted(target, method, limit) - if tolerance is not None and len(self): + if tolerance is not None: indexer = self._filter_indexer_tolerance(target_values, indexer, tolerance) return indexer @@ -3247,21 +3208,12 @@ def _get_nearest_indexer(self, target: "Index", limit, tolerance) -> np.ndarray: values that can be subtracted from each other (e.g., not strings or tuples). """ - if not len(self): - return self._get_fill_indexer(target, "pad") - left_indexer = self.get_indexer(target, "pad", limit=limit) right_indexer = self.get_indexer(target, "backfill", limit=limit) target_values = target._values - # error: Unsupported left operand type for - ("ExtensionArray") - left_distances = np.abs( - self._values[left_indexer] - target_values # type: ignore[operator] - ) - # error: Unsupported left operand type for - ("ExtensionArray") - right_distances = np.abs( - self._values[right_indexer] - target_values # type: ignore[operator] - ) + left_distances = np.abs(self._values[left_indexer] - target_values) + right_distances = np.abs(self._values[right_indexer] - target_values) op = operator.lt if self.is_monotonic_increasing else operator.le indexer = np.where( @@ -3280,8 +3232,7 @@ def _filter_indexer_tolerance( indexer: np.ndarray, tolerance, ) -> np.ndarray: - # error: Unsupported left operand type for - ("ExtensionArray") - distance = abs(self._values[indexer] - target) # type: ignore[operator] + distance = abs(self._values[indexer] - target) indexer = np.where(distance <= tolerance, indexer, -1) return indexer @@ -3433,11 +3384,11 @@ def _convert_list_indexer(self, keyarr): return None @final - def _invalid_indexer(self, form: str_t, key) -> TypeError: + def _invalid_indexer(self, form: str_t, key): """ Consistent invalid indexer message. 
""" - return TypeError( + raise TypeError( f"cannot do {form} indexing on {type(self).__name__} with these " f"indexers [{key}] of type {type(key).__name__}" ) @@ -3485,7 +3436,6 @@ def reindex(self, target, method=None, level=None, limit=None, tolerance=None): target = ensure_has_len(target) # target may be an iterator if not isinstance(target, Index) and len(target) == 0: - values: Union[range, ExtensionArray, np.ndarray] if isinstance(self, ABCRangeIndex): values = range(0) else: @@ -3558,7 +3508,7 @@ def _reindex_non_unique(self, target): cur_labels = self.take(indexer[check]).values cur_indexer = ensure_int64(length[check]) - new_labels = np.empty((len(indexer),), dtype=object) + new_labels = np.empty(tuple([len(indexer)]), dtype=object) new_labels[cur_indexer] = cur_labels new_labels[missing_indexer] = missing_labels @@ -4011,11 +3961,7 @@ def _join_monotonic(self, other, how="left", return_indexers=False): else: return join_index - def _wrap_joined_index( - self: _IndexT, joined: np.ndarray, other: _IndexT - ) -> _IndexT: - assert other.dtype == self.dtype - + def _wrap_joined_index(self, joined, other): if isinstance(self, ABCMultiIndex): name = self.names if self.names == other.names else None else: @@ -4217,7 +4163,7 @@ def _is_memory_usage_qualified(self) -> bool: """ return self.is_object() - def is_type_compatible(self, kind: str_t) -> bool: + def is_type_compatible(self, kind) -> bool: """ Whether the index type is compatible with the provided type. """ @@ -4373,9 +4319,11 @@ def putmask(self, mask, value): numpy.ndarray.putmask : Changes elements of an array based on conditional and input values. """ - values = self._values.copy() + values = self.values.copy() try: converted = self._validate_fill_value(value) + np.putmask(values, mask, converted) + return self._shallow_copy(values) except (ValueError, TypeError) as err: if is_object_dtype(self): raise err @@ -4383,9 +4331,6 @@ def putmask(self, mask, value): # coerces to object return self.astype(object).putmask(mask, value) - np.putmask(values, mask, converted) - return self._shallow_copy(values) - def equals(self, other: object) -> bool: """ Determine if two Index object are equal. @@ -4451,7 +4396,7 @@ def equals(self, other: object) -> bool: if not isinstance(other, Index): return False - # If other is a subclass of self and defines its own equals method, we + # If other is a subclass of self and defines it's own equals method, we # dispatch to the subclass method. For instance for a MultiIndex, # a d-level MultiIndex can equal d-tuple Index. 
# Note: All EA-backed Index subclasses override equals @@ -4583,9 +4528,8 @@ def asof_locs(self, where: "Index", mask) -> np.ndarray: result = np.arange(len(self))[mask].take(locs) - # TODO: overload return type of ExtensionArray.__getitem__ - first_value = cast(Any, self._values[mask.argmax()]) - result[(locs == 0) & (where._values < first_value)] = -1 + first = mask.argmax() + result[(locs == 0) & (where._values < self._values[first])] = -1 return result @@ -4773,13 +4717,12 @@ def argsort(self, *args, **kwargs) -> np.ndarray: >>> idx[order] Index(['a', 'b', 'c', 'd'], dtype='object') """ - if needs_i8_conversion(self.dtype): - # TODO: these do not match the underlying EA argsort methods GH#37863 - return self.asi8.argsort(*args, **kwargs) + result = self.asi8 + + if result is None: + result = np.array(self) - # This works for either ndarray or EA, is overriden - # by RangeIndex, MultIIndex - return self._data.argsort(*args, **kwargs) + return result.argsort(*args, **kwargs) @final def get_value(self, series: "Series", key): @@ -4896,14 +4839,6 @@ def set_value(self, arr, key, value): @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs) def get_indexer_non_unique(self, target): target = ensure_index(target) - - if target.is_boolean() and self.is_numeric(): - # Treat boolean labels passed to a numeric index as not found. Without - # this fix False and True would be treated as 0 and 1 respectively. - # (GH #16877) - no_matches = -1 * np.ones(self.shape, dtype=np.intp) - return no_matches, no_matches - pself, ptarget = self._maybe_promote(target) if pself is not self or ptarget is not target: return pself.get_indexer_non_unique(ptarget) @@ -5153,7 +5088,7 @@ def isin(self, values, level=None): """ if level is not None: self._validate_index_level(level) - return algos.isin(self._values, values) + return algos.isin(self, values) def _get_string_slice(self, key: str_t): # this is for partial string indexing, @@ -5238,7 +5173,7 @@ def _validate_indexer(self, form: str_t, key, kind: str_t): elif is_integer(key): pass else: - raise self._invalid_indexer(form, key) + self._invalid_indexer(form, key) def _maybe_cast_slice_bound(self, label, side: str_t, kind): """ @@ -5267,7 +5202,7 @@ def _maybe_cast_slice_bound(self, label, side: str_t, kind): # datetimelike Indexes # reject them, if index does not contain label if (is_float(label) or is_integer(label)) and label not in self.values: - raise self._invalid_indexer("slice", label) + self._invalid_indexer("slice", label) return label @@ -5531,17 +5466,6 @@ def _cmp_method(self, other, op): """ Wrapper used to dispatch comparison operations. """ - if self.is_(other): - # fastpath - if op in {operator.eq, operator.le, operator.ge}: - arr = np.ones(len(self), dtype=bool) - if self._can_hold_na and not isinstance(self, ABCMultiIndex): - # TODO: should set MultiIndex._can_hold_na = False? 
- arr[self.isna()] = False - return arr - elif op in {operator.ne, operator.lt, operator.gt}: - return np.zeros(len(self), dtype=bool) - if isinstance(other, (np.ndarray, Index, ABCSeries, ExtensionArray)): if len(self) != len(other): raise ValueError("Lengths must match to compare") diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index e2507aeaeb652..24bd60a7356dd 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -1,4 +1,4 @@ -from typing import Any, List, Optional +from typing import Any, List import warnings import numpy as np @@ -6,6 +6,7 @@ from pandas._config import get_option from pandas._libs import index as libindex +from pandas._libs.hashtable import duplicated_int64 from pandas._libs.lib import no_default from pandas._typing import ArrayLike, Label from pandas.util._decorators import Appender, cache_readonly, doc @@ -13,7 +14,10 @@ from pandas.core.dtypes.common import ( ensure_platform_int, is_categorical_dtype, + is_interval_dtype, + is_list_like, is_scalar, + pandas_dtype, ) from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna, notna @@ -220,25 +224,16 @@ def _simple_new(cls, values: Categorical, name: Label = None): result._cache = {} result._reset_identity() + result._no_setting_name = False return result # -------------------------------------------------------------------- - # error: Argument 1 of "_shallow_copy" is incompatible with supertype - # "ExtensionIndex"; supertype defines the argument type as - # "Optional[ExtensionArray]" [override] @doc(Index._shallow_copy) - def _shallow_copy( # type:ignore[override] - self, - values: Optional[Categorical] = None, - name: Label = no_default, - ): + def _shallow_copy(self, values=None, name: Label = no_default): name = self.name if name is no_default else name if values is not None: - # In tests we only get here with Categorical objects that - # have matching .ordered, and values.categories a subset of - # our own. However we do _not_ have a dtype match in general. values = Categorical(values, dtype=self.dtype) return super()._shallow_copy(values=values, name=name) @@ -250,10 +245,6 @@ def _is_dtype_compat(self, other) -> Categorical: provide a comparison between the dtype of self and other (coercing if needed) - Parameters - ---------- - other : Index - Returns ------- Categorical @@ -270,6 +261,8 @@ def _is_dtype_compat(self, other) -> Categorical: ) else: values = other + if not is_list_like(values): + values = [values] cat = Categorical(other, dtype=self.dtype) other = CategoricalIndex(cat) @@ -363,6 +356,11 @@ def values(self): """ return the underlying data, which is a Categorical """ return self._data + @property + def _has_complex_internals(self) -> bool: + # used to avoid libreduction code paths, which raise or require conversion + return True + @doc(Index.__contains__) def __contains__(self, key: Any) -> bool: # if key is a NaN, check if any NaN is in self. 
@@ -373,8 +371,20 @@ def __contains__(self, key: Any) -> bool: @doc(Index.astype) def astype(self, dtype, copy=True): - res_data = self._data.astype(dtype, copy=copy) - return Index(res_data, name=self.name) + if dtype is not None: + dtype = pandas_dtype(dtype) + + if is_interval_dtype(dtype): + from pandas import IntervalIndex + + return IntervalIndex(np.array(self)) + elif is_categorical_dtype(dtype): + # GH 18630 + dtype = self.dtype.update_dtype(dtype) + if dtype == self.dtype: + return self.copy() if copy else self + + return Index.astype(self, dtype=dtype, copy=copy) @doc(Index.fillna) def fillna(self, value, downcast=None): @@ -399,10 +409,27 @@ def unique(self, level=None): # of result, not self. return type(self)._simple_new(result, name=self.name) + @doc(Index.duplicated) + def duplicated(self, keep="first"): + codes = self.codes.astype("i8") + return duplicated_int64(codes, keep) + def _to_safe_for_reshape(self): """ convert to object if we are a categorical """ return self.astype("object") + @doc(Index.where) + def where(self, cond, other=None): + # TODO: Investigate an alternative implementation with + # 1. copy the underlying Categorical + # 2. setitem with `cond` and `other` + # 3. Rebuild CategoricalIndex. + if other is None: + other = self._na_value + values = np.where(cond, self._values, other) + cat = Categorical(values, dtype=self.dtype) + return type(self)._simple_new(cat, name=self.name) + def reindex(self, target, method=None, level=None, limit=None, tolerance=None): """ Create index with target's values (move/add/delete values as necessary) @@ -464,8 +491,7 @@ def reindex(self, target, method=None, level=None, limit=None, tolerance=None): # in which case we are going to conform to the passed Categorical new_target = np.asarray(new_target) if is_categorical_dtype(target): - new_target = Categorical(new_target, dtype=target.dtype) - new_target = type(self)._simple_new(new_target, name=self.name) + new_target = target._shallow_copy(new_target, name=self.name) else: new_target = Index(new_target, name=self.name) @@ -488,8 +514,7 @@ def _reindex_non_unique(self, target): if not (cats == -1).any(): # .reindex returns normal Index. Revert to CategoricalIndex if # all targets are included in my categories - new_target = Categorical(new_target, dtype=self.dtype) - new_target = type(self)._simple_new(new_target, name=self.name) + new_target = self._shallow_copy(new_target) return new_target, indexer, new_indexer @@ -504,38 +529,53 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): method = missing.clean_reindex_fill_method(method) target = ibase.ensure_index(target) - self._check_indexing_method(method) - if self.is_unique and self.equals(target): return np.arange(len(self), dtype="intp") - return self._get_indexer_non_unique(target._values)[0] + if method == "pad" or method == "backfill": + raise NotImplementedError( + "method='pad' and method='backfill' not " + "implemented yet for CategoricalIndex" + ) + elif method == "nearest": + raise NotImplementedError( + "method='nearest' not implemented yet for CategoricalIndex" + ) + + # Note: we use engine.get_indexer_non_unique below because, even if + # `target` is unique, any non-category entries in it will be encoded + # as -1 by _get_codes_for_get_indexer, so `codes` may not be unique. 
+ codes = self._get_codes_for_get_indexer(target._values) + indexer, _ = self._engine.get_indexer_non_unique(codes) + return ensure_platform_int(indexer) @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs) def get_indexer_non_unique(self, target): target = ibase.ensure_index(target) - return self._get_indexer_non_unique(target._values) - def _get_indexer_non_unique(self, values: ArrayLike): + codes = self._get_codes_for_get_indexer(target._values) + indexer, missing = self._engine.get_indexer_non_unique(codes) + return ensure_platform_int(indexer), missing + + def _get_codes_for_get_indexer(self, target: ArrayLike) -> np.ndarray: """ - get_indexer_non_unique but after unrapping the target Index object. + Extract integer codes we can use for comparison. + + Notes + ----- + If a value in target is not present, it gets coded as -1. """ - # Note: we use engine.get_indexer_non_unique for get_indexer in addition - # to get_indexer_non_unique because, even if `target` is unique, any - # non-category entries in it will be encoded as -1 so `codes` may - # not be unique. - if isinstance(values, Categorical): + if isinstance(target, Categorical): # Indexing on codes is more efficient if categories are the same, # so we can apply some optimizations based on the degree of # dtype-matching. - cat = self._data._encode_with_my_categories(values) + cat = self._data._encode_with_my_categories(target) codes = cat._codes else: - codes = self.categories.get_indexer(values) + codes = self.categories.get_indexer(target) - indexer, missing = self._engine.get_indexer_non_unique(codes) - return ensure_platform_int(indexer), missing + return codes @doc(Index._convert_list_indexer) def _convert_list_indexer(self, keyarr): @@ -543,11 +583,16 @@ def _convert_list_indexer(self, keyarr): # the categories if self.categories._defer_to_indexing: - # See tests.indexing.interval.test_interval:test_loc_getitem_frame indexer = self.categories._convert_list_indexer(keyarr) return Index(self.codes).get_indexer_for(indexer) - return self.get_indexer_for(keyarr) + indexer = self.categories.get_indexer(np.asarray(keyarr)) + if (indexer == -1).any(): + raise KeyError( + "a list-indexer must only include values that are in the categories" + ) + + return self.get_indexer(keyarr) @doc(Index._maybe_cast_slice_bound) def _maybe_cast_slice_bound(self, label, side: str, kind): diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 1b18f04ba603d..40a6086f69f85 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -2,7 +2,7 @@ Base and utility classes for tseries type pandas objects. 
""" from datetime import datetime -from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Type, TypeVar, Union, cast +from typing import TYPE_CHECKING, Any, List, Optional, Tuple, TypeVar, Union, cast import numpy as np @@ -10,6 +10,7 @@ from pandas._libs.tslibs import BaseOffset, Resolution, Tick from pandas._typing import Callable, Label from pandas.compat.numpy import function as nv +from pandas.errors import AbstractMethodError from pandas.util._decorators import Appender, cache_readonly, doc from pandas.core.dtypes.common import ( @@ -22,10 +23,12 @@ is_scalar, ) from pandas.core.dtypes.concat import concat_compat -from pandas.core.dtypes.generic import ABCSeries +from pandas.core.dtypes.generic import ABCIndex, ABCSeries +from pandas.core import algorithms from pandas.core.arrays import DatetimeArray, PeriodArray, TimedeltaArray from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin +from pandas.core.base import IndexOpsMixin import pandas.core.common as com import pandas.core.indexes.base as ibase from pandas.core.indexes.base import Index, _index_shared_docs @@ -53,22 +56,16 @@ def _join_i8_wrapper(joinf, with_indexers: bool = True): # error: 'staticmethod' used with a non-method @staticmethod # type: ignore[misc] def wrapper(left, right): - # Note: these only get called with left.dtype == right.dtype - if isinstance( - left, (np.ndarray, DatetimeIndexOpsMixin, ABCSeries, DatetimeLikeArrayMixin) - ): + if isinstance(left, (np.ndarray, ABCIndex, ABCSeries, DatetimeLikeArrayMixin)): left = left.view("i8") - if isinstance( - right, - (np.ndarray, DatetimeIndexOpsMixin, ABCSeries, DatetimeLikeArrayMixin), - ): + if isinstance(right, (np.ndarray, ABCIndex, ABCSeries, DatetimeLikeArrayMixin)): right = right.view("i8") results = joinf(left, right) if with_indexers: # dtype should be timedelta64[ns] for TimedeltaIndex # and datetime64[ns] for DatetimeIndex - dtype = cast(np.dtype, left.dtype).base + dtype = left.dtype.base join_index, left_indexer, right_indexer = results join_index = join_index.view(dtype) @@ -91,7 +88,6 @@ class DatetimeIndexOpsMixin(NDArrayBackedExtensionIndex): _can_hold_strings = False _data: Union[DatetimeArray, TimedeltaArray, PeriodArray] - _data_cls: Union[Type[DatetimeArray], Type[TimedeltaArray], Type[PeriodArray]] freq: Optional[BaseOffset] freqstr: Optional[str] _resolution_obj: Resolution @@ -104,25 +100,6 @@ class DatetimeIndexOpsMixin(NDArrayBackedExtensionIndex): ) _hasnans = hasnans # for index / array -agnostic code - @classmethod - def _simple_new( - cls, - values: Union[DatetimeArray, TimedeltaArray, PeriodArray], - name: Label = None, - ): - assert isinstance(values, cls._data_cls), type(values) - - result = object.__new__(cls) - result._data = values - result._name = name - result._cache = {} - - # For groupby perf. 
See note in indexes/base about _index_data - result._index_data = values._data - - result._reset_identity() - return result - @property def _is_all_dates(self) -> bool: return True @@ -163,8 +140,16 @@ def equals(self, other: object) -> bool: elif other.dtype.kind in ["f", "i", "u", "c"]: return False elif not isinstance(other, type(self)): + inferrable = [ + "timedelta", + "timedelta64", + "datetime", + "datetime64", + "date", + "period", + ] + should_try = False - inferrable = self._data._infer_matches if other.dtype == object: should_try = other.inferred_type in inferrable elif is_categorical_dtype(other.dtype): @@ -213,6 +198,10 @@ def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): result._data._freq = freq return result + @doc(IndexOpsMixin.searchsorted, klass="Datetime-like Index") + def searchsorted(self, value, side="left", sorter=None): + return self._data.searchsorted(value, side=side, sorter=sorter) + _can_hold_na = True _na_value = NaT @@ -248,23 +237,23 @@ def min(self, axis=None, skipna=True, *args, **kwargs): return self._na_value i8 = self.asi8 - - if len(i8) and self.is_monotonic_increasing: + try: # quick check - if i8[0] != iNaT: - return self._data._box_func(i8[0]) - - if self.hasnans: - if not skipna: - return self._na_value - i8 = i8[~self._isnan] - - if not len(i8): + if len(i8) and self.is_monotonic: + if i8[0] != iNaT: + return self._data._box_func(i8[0]) + + if self.hasnans: + if skipna: + min_stamp = self[~self._isnan].asi8.min() + else: + return self._na_value + else: + min_stamp = i8.min() + return self._data._box_func(min_stamp) + except ValueError: return self._na_value - min_stamp = i8.min() - return self._data._box_func(min_stamp) - def argmin(self, axis=None, skipna=True, *args, **kwargs): """ Returns the indices of the minimum values along an axis. @@ -305,23 +294,23 @@ def max(self, axis=None, skipna=True, *args, **kwargs): return self._na_value i8 = self.asi8 - - if len(i8) and self.is_monotonic: + try: # quick check - if i8[-1] != iNaT: - return self._data._box_func(i8[-1]) - - if self.hasnans: - if not skipna: - return self._na_value - i8 = i8[~self._isnan] - - if not len(i8): + if len(i8) and self.is_monotonic: + if i8[-1] != iNaT: + return self._data._box_func(i8[-1]) + + if self.hasnans: + if skipna: + max_stamp = self[~self._isnan].asi8.max() + else: + return self._na_value + else: + max_stamp = i8.max() + return self._data._box_func(max_stamp) + except ValueError: return self._na_value - max_stamp = i8.max() - return self._data._box_func(max_stamp) - def argmax(self, axis=None, skipna=True, *args, **kwargs): """ Returns the indices of the maximum values along an axis. @@ -380,7 +369,7 @@ def _format_with_header( @property def _formatter_func(self): - return self._data._formatter() + raise AbstractMethodError(self) def _format_attrs(self): """ @@ -395,36 +384,6 @@ def _format_attrs(self): attrs.append(("freq", freq)) return attrs - def _summary(self, name=None) -> str: - """ - Return a summarized representation. - - Parameters - ---------- - name : str - Name to use in the summary representation. - - Returns - ------- - str - Summarized representation of the index. 
- """ - formatter = self._formatter_func - if len(self) > 0: - index_summary = f", {formatter(self[0])} to {formatter(self[-1])}" - else: - index_summary = "" - - if name is None: - name = type(self).__name__ - result = f"{name}: {len(self)} entries{index_summary}" - if self.freq: - result += f"\nFreq: {self.freqstr}" - - # display as values, not quoted - result = result.replace("'", "") - return result - # -------------------------------------------------------------------- # Indexing Methods @@ -455,7 +414,7 @@ def _partial_date_slice( vals = self._data._ndarray unbox = self._data._unbox - if self.is_monotonic_increasing: + if self.is_monotonic: if len(self) and ( (t1 < self[0] and t2 < self[0]) or (t1 > self[-1] and t2 > self[-1]) @@ -497,6 +456,68 @@ def _partial_date_slice( __truediv__ = make_wrapped_arith_op("__truediv__") __rtruediv__ = make_wrapped_arith_op("__rtruediv__") + def isin(self, values, level=None): + """ + Compute boolean array of whether each index value is found in the + passed set of values. + + Parameters + ---------- + values : set or sequence of values + + Returns + ------- + is_contained : ndarray (boolean dtype) + """ + if level is not None: + self._validate_index_level(level) + + if not isinstance(values, type(self)): + try: + values = type(self)(values) + except ValueError: + return self.astype(object).isin(values) + + return algorithms.isin(self.asi8, values.asi8) + + @Appender(Index.where.__doc__) + def where(self, cond, other=None): + other = self._data._validate_setitem_value(other) + + result = np.where(cond, self._data._ndarray, other) + arr = self._data._from_backing_data(result) + return type(self)._simple_new(arr, name=self.name) + + def _summary(self, name=None) -> str: + """ + Return a summarized representation. + + Parameters + ---------- + name : str + Name to use in the summary representation. + + Returns + ------- + str + Summarized representation of the index. + """ + formatter = self._formatter_func + if len(self) > 0: + index_summary = f", {formatter(self[0])} to {formatter(self[-1])}" + else: + index_summary = "" + + if name is None: + name = type(self).__name__ + result = f"{name}: {len(self)} entries{index_summary}" + if self.freq: + result += f"\nFreq: {self.freqstr}" + + # display as values, not quoted + result = result.replace("'", "") + return result + def shift(self, periods=1, freq=None): """ Shift index by desired number of time frequency increments. 
@@ -641,13 +662,15 @@ def _with_freq(self, freq): arr = self._data._with_freq(freq) return type(self)._simple_new(arr, name=self.name) - @property - def _has_complex_internals(self) -> bool: - # used to avoid libreduction code paths, which raise or require conversion - return False + def _shallow_copy(self, values=None, name: Label = lib.no_default): + name = self.name if name is lib.no_default else name + + if values is not None: + return self._simple_new(values, name=name) - def is_type_compatible(self, kind: str) -> bool: - return kind in self._data._infer_matches + result = self._simple_new(self._data, name=name) + result._cache = self._cache + return result # -------------------------------------------------------------------- # Set Operation Methods @@ -722,14 +745,15 @@ def intersection(self, other, sort=False): start = right[0] if end < start: - result = self[:0] + # pandas\core\indexes\datetimelike.py:758: error: Unexpected + # keyword argument "freq" for "DatetimeTimedeltaMixin" [call-arg] + result = type(self)( + data=[], dtype=self.dtype, freq=self.freq # type: ignore[call-arg] + ) else: lslice = slice(*left.slice_locs(start, end)) left_chunk = left._values[lslice] - # error: Argument 1 to "_simple_new" of "DatetimeIndexOpsMixin" has - # incompatible type "Union[ExtensionArray, Any]"; expected - # "Union[DatetimeArray, TimedeltaArray, PeriodArray]" [arg-type] - result = type(self)._simple_new(left_chunk) # type: ignore[arg-type] + result = type(self)._simple_new(left_chunk) return self._wrap_setop_result(other, result) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index f6eeb121b1ac0..9744eb0ecbb88 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -14,14 +14,17 @@ to_offset, ) from pandas._libs.tslibs.offsets import prefix_mapping -from pandas._typing import DtypeObj +from pandas._typing import DtypeObj, Label from pandas.errors import InvalidIndexError from pandas.util._decorators import cache_readonly, doc from pandas.core.dtypes.common import ( DT64NS_DTYPE, + is_datetime64_any_dtype, is_datetime64_dtype, is_datetime64tz_dtype, + is_float, + is_integer, is_scalar, ) from pandas.core.dtypes.missing import is_valid_nat_for_dtype @@ -217,7 +220,6 @@ class DatetimeIndex(DatetimeTimedeltaMixin): _typ = "datetimeindex" - _data_cls = DatetimeArray _engine_type = libindex.DatetimeEngine _supports_partial_string_indexing = True @@ -317,6 +319,20 @@ def __new__( subarr = cls._simple_new(dtarr, name=name) return subarr + @classmethod + def _simple_new(cls, values: DatetimeArray, name: Label = None): + assert isinstance(values, DatetimeArray), type(values) + + result = object.__new__(cls) + result._data = values + result.name = name + result._cache = {} + result._no_setting_name = False + # For groupby perf. See note in indexes/base about _index_data + result._index_data = values._data + result._reset_identity() + return result + # -------------------------------------------------------------------- @cache_readonly @@ -351,6 +367,8 @@ def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: """ Can we compare values of the given dtype to our own? 
""" + if not is_datetime64_any_dtype(dtype): + return False if self.tz is not None: # If we have tz, we can compare to tzaware return is_datetime64tz_dtype(dtype) @@ -369,7 +387,7 @@ def _formatter_func(self): from pandas.io.formats.format import get_format_datetime64 formatter = get_format_datetime64(is_dates_only=self._is_dates_only) - return lambda x: f"'{formatter(x)}'" + return lambda x: f"'{formatter(x, tz=self.tz)}'" # -------------------------------------------------------------------- # Set Operation Methods @@ -715,13 +733,12 @@ def _maybe_cast_slice_bound(self, label, side: str, kind): """ assert kind in ["loc", "getitem", None] + if is_float(label) or isinstance(label, time) or is_integer(label): + self._invalid_indexer("slice", label) + if isinstance(label, str): freq = getattr(self, "freqstr", getattr(self, "inferred_freq", None)) - try: - parsed, reso = parsing.parse_time_string(label, freq) - except parsing.DateParseError as err: - raise self._invalid_indexer("slice", label) from err - + parsed, reso = parsing.parse_time_string(label, freq) reso = Resolution.from_attrname(reso) lower, upper = self._parsed_string_to_bounds(reso, parsed) # lower, upper form the half-open interval: @@ -735,9 +752,6 @@ def _maybe_cast_slice_bound(self, label, side: str, kind): return lower if side == "left" else upper elif isinstance(label, (self._data._recognized_scalars, date)): self._deprecate_mismatched_indexing(label) - else: - raise self._invalid_indexer("slice", label) - return self._maybe_cast_for_get_loc(label) def _get_string_slice(self, key: str): @@ -789,25 +803,14 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None): end is None or isinstance(end, str) ): mask = np.array(True) - deprecation_mask = np.array(True) if start is not None: start_casted = self._maybe_cast_slice_bound(start, "left", kind) mask = start_casted <= self - deprecation_mask = start_casted == self if end is not None: end_casted = self._maybe_cast_slice_bound(end, "right", kind) mask = (self <= end_casted) & mask - deprecation_mask = (end_casted == self) | deprecation_mask - - if not deprecation_mask.any(): - warnings.warn( - "Value based partial slicing on non-monotonic DatetimeIndexes " - "with non-existing keys is deprecated and will raise a " - "KeyError in a future Version.", - FutureWarning, - stacklevel=5, - ) + indexer = mask.nonzero()[0][::step] if len(indexer) == len(self): return slice(None) @@ -818,6 +821,9 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None): # -------------------------------------------------------------------- + def is_type_compatible(self, typ) -> bool: + return typ == self.inferred_type or typ == "datetime" + @property def inferred_type(self) -> str: # b/c datetime is represented as microseconds since the epoch, make diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py index 3f146e273326c..3103c27b35d74 100644 --- a/pandas/core/indexes/extension.py +++ b/pandas/core/indexes/extension.py @@ -1,12 +1,10 @@ """ Shared methods for Index subclasses backed by ExtensionArray. 
""" -from typing import List, Optional, TypeVar +from typing import List, TypeVar import numpy as np -from pandas._libs import lib -from pandas._typing import Label from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError from pandas.util._decorators import cache_readonly, doc @@ -213,24 +211,6 @@ class ExtensionIndex(Index): __le__ = _make_wrapped_comparison_op("__le__") __ge__ = _make_wrapped_comparison_op("__ge__") - @doc(Index._shallow_copy) - def _shallow_copy( - self, values: Optional[ExtensionArray] = None, name: Label = lib.no_default - ): - name = self.name if name is lib.no_default else name - - if values is not None: - return self._simple_new(values, name=name) - - result = self._simple_new(self._data, name=name) - result._cache = self._cache - return result - - @property - def _has_complex_internals(self) -> bool: - # used to avoid libreduction code paths, which raise or require conversion - return True - # --------------------------------------------------------------------- # NDarray-Like Methods @@ -248,34 +228,18 @@ def __getitem__(self, key): deprecate_ndim_indexing(result) return result - def searchsorted(self, value, side="left", sorter=None) -> np.ndarray: - # overriding IndexOpsMixin improves performance GH#38083 - return self._data.searchsorted(value, side=side, sorter=sorter) - # --------------------------------------------------------------------- - def _check_indexing_method(self, method): - """ - Raise if we have a get_indexer `method` that is not supported or valid. - """ - # GH#37871 for now this is only for IntervalIndex and CategoricalIndex - if method is None: - return - - if method in ["bfill", "backfill", "pad", "ffill", "nearest"]: - raise NotImplementedError( - f"method {method} not yet implemented for {type(self).__name__}" - ) - - raise ValueError("Invalid fill method") - def _get_engine_target(self) -> np.ndarray: - return np.asarray(self._data) + # NB: _values_for_argsort happens to match the desired engine targets + # for all of our existing EA-backed indexes, but in general + # cannot be relied upon to exist. 
+ return self._data._values_for_argsort() def repeat(self, repeats, axis=None): nv.validate_repeat(tuple(), dict(axis=axis)) result = self._data.repeat(repeats, axis=axis) - return type(self)._simple_new(result, name=self.name) + return self._shallow_copy(result) def insert(self, loc: int, item): # ExtensionIndex subclasses must override Index.insert @@ -342,9 +306,6 @@ class NDArrayBackedExtensionIndex(ExtensionIndex): _data: NDArrayBackedExtensionArray - def _get_engine_target(self) -> np.ndarray: - return self._data._ndarray - def delete(self, loc): """ Make new Index with passed location(-s) deleted @@ -382,19 +343,16 @@ def insert(self, loc: int, item): new_arr = arr._from_backing_data(new_vals) return type(self)._simple_new(new_arr, name=self.name) - @doc(Index.where) - def where(self, cond, other=None): - res_values = self._data.where(cond, other) - return type(self)._simple_new(res_values, name=self.name) - def putmask(self, mask, value): - res_values = self._data.copy() try: - res_values.putmask(mask, value) + value = self._data._validate_setitem_value(value) except (TypeError, ValueError): return self.astype(object).putmask(mask, value) - return type(self)._simple_new(res_values, name=self.name) + new_values = self._data._ndarray.copy() + np.putmask(new_values, mask, value) + new_arr = self._data._from_backing_data(new_values) + return type(self)._simple_new(new_arr, name=self.name) def _wrap_joined_index(self: _T, joined: np.ndarray, other: _T) -> _T: name = get_op_result_name(self, other) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index ed92b3dade6a0..2aec86c9cdfae 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -130,13 +130,19 @@ def wrapped(self, other, sort=False): if op_name in ("difference",): result = result.astype(self.dtype) return result + elif self.closed != other.closed: + raise ValueError( + "can only do set operations between two IntervalIndex " + "objects that are closed on the same side" + ) - if self._is_non_comparable_own_type(other): - # GH#19016: ensure set op will not return a prohibited dtype + # GH 19016: ensure set op will not return a prohibited dtype + subtypes = [self.dtype.subtype, other.dtype.subtype] + common_subtype = find_common_type(subtypes) + if is_object_dtype(common_subtype): raise TypeError( - "can only do set operations between two IntervalIndex " - "objects that are closed on the same side " - "and have compatible dtypes" + f"can only do {op_name} between two IntervalIndex " + "objects that have compatible dtypes" ) return method(self, other, sort) @@ -233,6 +239,7 @@ def _simple_new(cls, array: IntervalArray, name: Label = None): result._data = array result.name = name result._cache = {} + result._no_setting_name = False result._reset_identity() return result @@ -320,6 +327,19 @@ def from_tuples( # -------------------------------------------------------------------- + @Appender(Index._shallow_copy.__doc__) + def _shallow_copy( + self, values: Optional[IntervalArray] = None, name: Label = lib.no_default + ): + name = self.name if name is lib.no_default else name + + if values is not None: + return self._simple_new(values, name=name) + + result = self._simple_new(self._data, name=name) + result._cache = self._cache + return result + @cache_readonly def _engine(self): left = self._maybe_convert_i8(self.left) @@ -360,6 +380,11 @@ def values(self) -> IntervalArray: """ return self._data + @property + def _has_complex_internals(self) -> bool: + # used to avoid 
libreduction code paths, which raise or require conversion + return True + def __array_wrap__(self, result, context=None): # we don't want the superclass implementation return result @@ -373,7 +398,9 @@ def __reduce__(self): def astype(self, dtype, copy: bool = True): with rewrite_exception("IntervalArray", type(self).__name__): new_values = self._values.astype(dtype, copy=copy) - return Index(new_values, dtype=new_values.dtype, name=self.name) + if is_interval_dtype(new_values.dtype): + return self._shallow_copy(new_values) + return Index.astype(self, dtype, copy=copy) @property def inferred_type(self) -> str: @@ -479,7 +506,7 @@ def _needs_i8_conversion(self, key) -> bool: """ Check if a given key needs i8 conversion. Conversion is necessary for Timestamp, Timedelta, DatetimeIndex, and TimedeltaIndex keys. An - Interval-like requires conversion if its endpoints are one of the + Interval-like requires conversion if it's endpoints are one of the aforementioned types. Assumes that any list-like data has already been cast to an Index. @@ -501,7 +528,7 @@ def _needs_i8_conversion(self, key) -> bool: def _maybe_convert_i8(self, key): """ - Maybe convert a given key to its equivalent i8 value(s). Used as a + Maybe convert a given key to it's equivalent i8 value(s). Used as a preprocessing step prior to IntervalTree queries (self._engine), which expects numeric data. @@ -540,7 +567,7 @@ def _maybe_convert_i8(self, key): # DatetimeIndex/TimedeltaIndex key_dtype, key_i8 = key.dtype, Index(key.asi8) if key.hasnans: - # convert NaT from its i8 value to np.nan so it's not viewed + # convert NaT from it's i8 value to np.nan so it's not viewed # as a valid value, maybe causing errors (e.g. is_overlapping) key_i8 = key_i8.where(~key._isnan) @@ -555,6 +582,17 @@ def _maybe_convert_i8(self, key): return key_i8 + def _check_method(self, method): + if method is None: + return + + if method in ["bfill", "backfill", "pad", "ffill", "nearest"]: + raise NotImplementedError( + f"method {method} not yet implemented for IntervalIndex" + ) + + raise ValueError("Invalid fill method") + def _searchsorted_monotonic(self, label, side, exclude_label=False): if not self.is_non_overlapping_monotonic: raise KeyError( @@ -625,7 +663,7 @@ def get_loc( >>> index.get_loc(pd.Interval(0, 1)) 0 """ - self._check_indexing_method(method) + self._check_method(method) if not is_scalar(key): raise InvalidIndexError(key) @@ -676,7 +714,7 @@ def get_indexer( tolerance: Optional[Any] = None, ) -> np.ndarray: - self._check_indexing_method(method) + self._check_method(method) if self.is_overlapping: raise InvalidIndexError( @@ -691,8 +729,11 @@ def get_indexer( if self.equals(target_as_index): return np.arange(len(self), dtype="intp") - if self._is_non_comparable_own_type(target_as_index): - # different closed or incompatible subtype -> no matches + # different closed or incompatible subtype -> no matches + common_subtype = find_common_type( + [self.dtype.subtype, target_as_index.dtype.subtype] + ) + if self.closed != target_as_index.closed or is_object_dtype(common_subtype): return np.repeat(np.intp(-1), len(target_as_index)) # non-overlapping -> at most one match per interval in target_as_index @@ -712,7 +753,17 @@ def get_indexer( indexer = self._engine.get_indexer(target_as_index.values) else: # heterogeneous scalar index: defer elementwise to get_loc - return self._get_indexer_pointwise(target_as_index)[0] + # (non-overlapping so get_loc guarantees scalar of KeyError) + indexer = [] + for key in target_as_index: + try: + loc = 
self.get_loc(key) + except KeyError: + loc = -1 + except InvalidIndexError as err: + # i.e. non-scalar key + raise TypeError(key) from err + indexer.append(loc) return ensure_platform_int(indexer) @@ -724,8 +775,10 @@ def get_indexer_non_unique( # check that target_as_index IntervalIndex is compatible if isinstance(target_as_index, IntervalIndex): - - if self._is_non_comparable_own_type(target_as_index): + common_subtype = find_common_type( + [self.dtype.subtype, target_as_index.dtype.subtype] + ) + if self.closed != target_as_index.closed or is_object_dtype(common_subtype): # different closed or incompatible subtype -> no matches return ( np.repeat(-1, len(target_as_index)), @@ -736,8 +789,18 @@ def get_indexer_non_unique( target_as_index, IntervalIndex ): # target_as_index might contain intervals: defer elementwise to get_loc - return self._get_indexer_pointwise(target_as_index) - + indexer, missing = [], [] + for i, key in enumerate(target_as_index): + try: + locs = self.get_loc(key) + if isinstance(locs, slice): + locs = np.arange(locs.start, locs.stop, locs.step, dtype="intp") + locs = np.array(locs, ndmin=1) + except KeyError: + missing.append(i) + locs = np.array([-1]) + indexer.append(locs) + indexer = np.concatenate(indexer) else: target_as_index = self._maybe_convert_i8(target_as_index) indexer, missing = self._engine.get_indexer_non_unique( @@ -746,30 +809,6 @@ def get_indexer_non_unique( return ensure_platform_int(indexer), ensure_platform_int(missing) - def _get_indexer_pointwise(self, target: Index) -> Tuple[np.ndarray, np.ndarray]: - """ - pointwise implementation for get_indexer and get_indexer_non_unique. - """ - indexer, missing = [], [] - for i, key in enumerate(target): - try: - locs = self.get_loc(key) - if isinstance(locs, slice): - # Only needed for get_indexer_non_unique - locs = np.arange(locs.start, locs.stop, locs.step, dtype="intp") - locs = np.array(locs, ndmin=1) - except KeyError: - missing.append(i) - locs = np.array([-1]) - except InvalidIndexError as err: - # i.e. 
non-scalar key - raise TypeError(key) from err - - indexer.append(locs) - - indexer = np.concatenate(indexer) - return ensure_platform_int(indexer), ensure_platform_int(missing) - @property def _index_as_unique(self): return not self.is_overlapping @@ -806,20 +845,10 @@ def _convert_list_indexer(self, keyarr): # we have missing values if (locs == -1).any(): - raise KeyError(keyarr[locs == -1].tolist()) + raise KeyError return locs - def _is_non_comparable_own_type(self, other: "IntervalIndex") -> bool: - # different closed or incompatible subtype -> no matches - - # TODO: once closed is part of IntervalDtype, we can just define - # is_comparable_dtype GH#19371 - if self.closed != other.closed: - return True - common_subtype = find_common_type([self.dtype.subtype, other.dtype.subtype]) - return is_object_dtype(common_subtype) - # -------------------------------------------------------------------- @cache_readonly @@ -838,22 +867,6 @@ def mid(self): def length(self): return Index(self._data.length, copy=False) - def putmask(self, mask, value): - arr = self._data.copy() - try: - value_left, value_right = arr._validate_setitem_value(value) - except (ValueError, TypeError): - return self.astype(object).putmask(mask, value) - - if isinstance(self._data._left, np.ndarray): - np.putmask(arr._left, mask, value_left) - np.putmask(arr._right, mask, value_right) - else: - # TODO: special case not needed with __array_function__ - arr._left.putmask(mask, value_left) - arr._right.putmask(mask, value_right) - return type(self)._simple_new(arr, name=self.name) - @Appender(Index.where.__doc__) def where(self, cond, other=None): if other is None: @@ -872,8 +885,8 @@ def delete(self, loc): """ new_left = self.left.delete(loc) new_right = self.right.delete(loc) - result = self._data._shallow_copy(new_left, new_right) - return type(self)._simple_new(result, name=self.name) + result = IntervalArray.from_arrays(new_left, new_right, closed=self.closed) + return self._shallow_copy(result) def insert(self, loc, item): """ @@ -894,8 +907,8 @@ def insert(self, loc, item): new_left = self.left.insert(loc, left_insert) new_right = self.right.insert(loc, right_insert) - result = self._data._shallow_copy(new_left, new_right) - return type(self)._simple_new(result, name=self.name) + result = IntervalArray.from_arrays(new_left, new_right, closed=self.closed) + return self._shallow_copy(result) # -------------------------------------------------------------------- # Rendering Methods @@ -953,6 +966,11 @@ def _format_space(self) -> str: space = " " * (len(type(self).__name__) + 1) return f"\n{space}" + # -------------------------------------------------------------------- + + def argsort(self, *args, **kwargs) -> np.ndarray: + return np.lexsort((self.right, self.left)) + # -------------------------------------------------------------------- # Set Operations diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 9b4b459d9a122..5a3f2b0853c4f 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -893,15 +893,6 @@ def set_levels(self, levels, level=None, inplace=None, verify_integrity=True): def nlevels(self) -> int: """ Integer number of levels in this MultiIndex. - - Examples - -------- - >>> mi = pd.MultiIndex.from_arrays([['a'], ['b'], ['c']]) - >>> mi - MultiIndex([('a', 'b', 'c')], - ) - >>> mi.nlevels - 3 """ return len(self._levels) @@ -909,15 +900,6 @@ def nlevels(self) -> int: def levshape(self): """ A tuple with the length of each level. 
- - Examples - -------- - >>> mi = pd.MultiIndex.from_arrays([['a'], ['b'], ['c']]) - >>> mi - MultiIndex([('a', 'b', 'c')], - ) - >>> mi.levshape - (1, 1, 1) """ return tuple(len(x) for x in self.levels) @@ -1063,7 +1045,7 @@ def set_codes(self, codes, level=None, inplace=None, verify_integrity=True): def _engine(self): # Calculate the number of bits needed to represent labels in each # level, as log2 of their sizes (including -1 for NaN): - sizes = np.ceil(np.log2([len(level) + 1 for level in self.levels])) + sizes = np.ceil(np.log2([len(l) + 1 for l in self.levels])) # Sum bit counts, starting from the _right_.... lev_bits = np.cumsum(sizes[::-1])[::-1] @@ -1083,19 +1065,34 @@ def _engine(self): @property def _constructor(self): - return type(self).from_tuples + return MultiIndex.from_tuples @doc(Index._shallow_copy) - def _shallow_copy(self, values=None, name=lib.no_default): - names = name if name is not lib.no_default else self.names + def _shallow_copy( + self, + values=None, + name=lib.no_default, + levels=None, + codes=None, + sortorder=None, + names=lib.no_default, + ): + if names is not lib.no_default and name is not lib.no_default: + raise TypeError("Can only provide one of `names` and `name`") + elif names is lib.no_default: + names = name if name is not lib.no_default else self.names if values is not None: - return type(self).from_tuples(values, sortorder=None, names=names) + assert levels is None and codes is None + return MultiIndex.from_tuples(values, sortorder=sortorder, names=names) - result = type(self)( - levels=self.levels, - codes=self.codes, - sortorder=None, + levels = levels if levels is not None else self.levels + codes = codes if codes is not None else self.codes + + result = MultiIndex( + levels=levels, + codes=codes, + sortorder=sortorder, names=names, verify_integrity=False, ) @@ -1103,6 +1100,18 @@ def _shallow_copy(self, values=None, name=lib.no_default): result._cache.pop("levels", None) # GH32669 return result + def symmetric_difference(self, other, result_name=None, sort=None): + # On equal symmetric_difference MultiIndexes the difference is empty. 
+ # Therefore, an empty MultiIndex is returned GH13490 + tups = Index.symmetric_difference(self, other, result_name, sort) + if len(tups) == 0: + return MultiIndex( + levels=[[] for _ in range(self.nlevels)], + codes=[[] for _ in range(self.nlevels)], + names=tups.name, + ) + return type(self).from_tuples(tups, names=tups.name) + # -------------------------------------------------------------------- def copy( @@ -1168,18 +1177,12 @@ def copy( if codes is None: codes = deepcopy(self.codes) - levels = levels if levels is not None else self.levels - codes = codes if codes is not None else self.codes - - new_index = type(self)( + new_index = self._shallow_copy( levels=levels, codes=codes, - sortorder=self.sortorder, names=names, - verify_integrity=False, + sortorder=self.sortorder, ) - new_index._cache = self._cache.copy() - new_index._cache.pop("levels", None) # GH32669 if dtype: warnings.warn( @@ -1217,10 +1220,10 @@ def dtype(self) -> np.dtype: def _is_memory_usage_qualified(self) -> bool: """ return a boolean if we need a qualified .info display """ - def f(level): - return "mixed" in level or "string" in level or "unicode" in level + def f(l): + return "mixed" in l or "string" in l or "unicode" in l - return any(f(level) for level in self._inferred_type_levels) + return any(f(l) for l in self._inferred_type_levels) @doc(Index.memory_usage) def memory_usage(self, deep: bool = False) -> int: @@ -1454,22 +1457,7 @@ def _set_names(self, names, level=None, validate=True): self._reset_cache() names = property( - fset=_set_names, - fget=_get_names, - doc=""" - Names of levels in MultiIndex. - - Examples - -------- - >>> mi = pd.MultiIndex.from_arrays( - ... [[1, 2], [3, 4], [5, 6]], names=['x', 'y', 'z']) - >>> mi - MultiIndex([(1, 3, 5), - (2, 4, 6)], - names=['x', 'y', 'z']) - >>> mi.names - FrozenList(['x', 'y', 'z']) - """, + fset=_set_names, fget=_get_names, doc="""\nNames of levels in MultiIndex.\n""" ) # -------------------------------------------------------------------- @@ -1713,32 +1701,6 @@ def to_frame(self, index=True, name=None): -------- DataFrame : Two-dimensional, size-mutable, potentially heterogeneous tabular data. - - Examples - -------- - >>> mi = pd.MultiIndex.from_arrays([['a', 'b'], ['c', 'd']]) - >>> mi - MultiIndex([('a', 'c'), - ('b', 'd')], - ) - - >>> df = mi.to_frame() - >>> df - 0 1 - a c a c - b d b d - - >>> df = mi.to_frame(index=False) - >>> df - 0 1 - 0 a c - 1 b d - - >>> df = mi.to_frame(name=['x', 'y']) - >>> df - x y - a c a c - b d b d """ from pandas import DataFrame @@ -2143,7 +2105,7 @@ def drop(self, codes, level=None, errors="raise"): Parameters ---------- codes : array-like - Must be a list of tuples when level is not specified + Must be a list of tuples level : int or level name, default None errors : str, default 'raise' @@ -2194,17 +2156,10 @@ def _drop_from_level(self, codes, level, errors="raise"): i = self._get_level_number(level) index = self.levels[i] values = index.get_indexer(codes) - # If nan should be dropped it will equal -1 here. 
We have to check which values - # are not nan and equal -1, this means they are missing in the index - nan_codes = isna(codes) - values[(np.equal(nan_codes, False)) & (values == -1)] = -2 - if index.shape[0] == self.shape[0]: - values[np.equal(nan_codes, True)] = -2 - - not_found = codes[values == -2] - if len(not_found) != 0 and errors != "ignore": - raise KeyError(f"labels {not_found} not found in level") + mask = ~algos.isin(self.codes[i], values) + if mask.all() and errors != "ignore": + raise KeyError(f"labels {codes} not found in level") return self[mask] @@ -2279,24 +2234,6 @@ def reorder_levels(self, order): Returns ------- MultiIndex - - Examples - -------- - >>> mi = pd.MultiIndex.from_arrays([[1, 2], [3, 4]], names=['x', 'y']) - >>> mi - MultiIndex([(1, 3), - (2, 4)], - names=['x', 'y']) - - >>> mi.reorder_levels(order=[1, 0]) - MultiIndex([(3, 1), - (4, 2)], - names=['y', 'x']) - - >>> mi.reorder_levels(order=['y', 'x']) - MultiIndex([(3, 1), - (4, 2)], - names=['y', 'x']) """ order = [self._get_level_number(i) for i in order] if len(order) != self.nlevels: @@ -2314,7 +2251,7 @@ def reorder_levels(self, order): def _get_codes_for_sorting(self): """ - we are categorizing our codes by using the + we categorizing our codes by using the available categories (all, not just observed) excluding any missing ones (-1); this is in preparation for sorting, where we need to disambiguate that -1 is not @@ -2355,34 +2292,6 @@ def sortlevel(self, level=0, ascending=True, sort_remaining=True): Resulting index. indexer : np.ndarray Indices of output values in original index. - - Examples - -------- - >>> mi = pd.MultiIndex.from_arrays([[0, 0], [2, 1]]) - >>> mi - MultiIndex([(0, 2), - (0, 1)], - ) - - >>> mi.sortlevel() - (MultiIndex([(0, 1), - (0, 2)], - ), array([1, 0])) - - >>> mi.sortlevel(sort_remaining=False) - (MultiIndex([(0, 2), - (0, 1)], - ), array([0, 1])) - - >>> mi.sortlevel(1) - (MultiIndex([(0, 1), - (0, 2)], - ), array([1, 0])) - - >>> mi.sortlevel(1, ascending=False) - (MultiIndex([(0, 2), - (0, 1)], - ), array([0, 1])) """ if isinstance(level, (str, int)): level = [level] @@ -2767,17 +2676,9 @@ def _partial_tup_index(self, tup, side="left"): return start + section.searchsorted(loc, side=side) idx = self._get_loc_single_level_index(lev, lab) - if isinstance(idx, slice) and k < n - 1: - # Get start and end value from slice, necessary when a non-integer - # interval is given as input GH#37707 - start = idx.start - end = idx.stop - elif k < n - 1: + if k < n - 1: end = start + section.searchsorted(idx, side="right") start = start + section.searchsorted(idx, side="left") - elif isinstance(idx, slice): - idx = idx.start - return start + section.searchsorted(idx, side=side) else: return start + section.searchsorted(idx, side=side) @@ -3113,8 +3014,6 @@ def convert_indexer(start, stop, step, indexer=indexer, codes=level_codes): start = 0 if key.stop is not None: stop = level_index.get_loc(key.stop) - elif isinstance(start, slice): - stop = len(level_index) else: stop = len(level_index) - 1 step = key.step @@ -3149,27 +3048,22 @@ def convert_indexer(start, stop, step, indexer=indexer, codes=level_codes): else: - idx = self._get_loc_single_level_index(level_index, key) + code = self._get_loc_single_level_index(level_index, key) if level > 0 or self.lexsort_depth == 0: # Desired level is not sorted - locs = np.array(level_codes == idx, dtype=bool, copy=False) + locs = np.array(level_codes == code, dtype=bool, copy=False) if not locs.any(): # The label is present in self.levels[level] 
but unused: raise KeyError(key) return locs - if isinstance(idx, slice): - start = idx.start - end = idx.stop - else: - start = level_codes.searchsorted(idx, side="left") - end = level_codes.searchsorted(idx, side="right") - - if start == end: + i = level_codes.searchsorted(code, side="left") + j = level_codes.searchsorted(code, side="right") + if i == j: # The label is present in self.levels[level] but unused: raise KeyError(key) - return slice(start, end) + return slice(i, j) def get_locs(self, seq): """ @@ -3234,26 +3128,19 @@ def _convert_to_indexer(r) -> Int64Index: r = r.nonzero()[0] return Int64Index(r) - def _update_indexer( - idxr: Optional[Index], indexer: Optional[Index], key - ) -> Index: + def _update_indexer(idxr: Optional[Index], indexer: Optional[Index]) -> Index: if indexer is None: indexer = Index(np.arange(n)) if idxr is None: return indexer - indexer_intersection = indexer.intersection(idxr) - if indexer_intersection.empty and not idxr.empty and not indexer.empty: - raise KeyError(key) - return indexer_intersection + return indexer.intersection(idxr) for i, k in enumerate(seq): if com.is_bool_indexer(k): # a boolean indexer, must be the same length! k = np.asarray(k) - indexer = _update_indexer( - _convert_to_indexer(k), indexer=indexer, key=seq - ) + indexer = _update_indexer(_convert_to_indexer(k), indexer=indexer) elif is_list_like(k): # a collection of labels to include from this level (these @@ -3265,7 +3152,7 @@ def _update_indexer( self._get_level_indexer(x, level=i, indexer=indexer) ) indexers = (idxrs if indexers is None else indexers).union( - idxrs, sort=False + idxrs ) except KeyError: @@ -3273,14 +3160,14 @@ def _update_indexer( continue if indexers is not None: - indexer = _update_indexer(indexers, indexer=indexer, key=seq) + indexer = _update_indexer(indexers, indexer=indexer) else: # no matches we are done return np.array([], dtype=np.int64) elif com.is_null_slice(k): # empty slice - indexer = _update_indexer(None, indexer=indexer, key=seq) + indexer = _update_indexer(None, indexer=indexer) elif isinstance(k, slice): @@ -3290,7 +3177,6 @@ def _update_indexer( self._get_level_indexer(k, level=i, indexer=indexer) ), indexer=indexer, - key=seq, ) else: # a single label @@ -3299,7 +3185,6 @@ def _update_indexer( self.get_loc_level(k, level=i, drop_level=False)[0] ), indexer=indexer, - key=seq, ) # empty indexer @@ -3352,9 +3237,6 @@ def _reorder_indexer( # order they appears in a list-like sequence # This mapping is then use to reorder the indexer for i, k in enumerate(seq): - if is_scalar(k): - # GH#34603 we want to treat a scalar the same as an all equal list - k = [k] if com.is_bool_indexer(k): new_order = np.arange(n)[indexer] elif is_list_like(k): @@ -3368,9 +3250,6 @@ def _reorder_indexer( key_order_map[level_indexer] = np.arange(len(level_indexer)) new_order = key_order_map[self.codes[i][indexer]] - elif isinstance(k, slice) and k.start is None and k.stop is None: - # slice(None) should not determine order GH#31330 - new_order = np.ones((n,))[indexer] else: # For all other case, use the same order as the level new_order = np.arange(n)[indexer] @@ -3429,19 +3308,21 @@ def equals(self, other: object) -> bool: if not isinstance(other, Index): return False - if len(self) != len(other): - return False - if not isinstance(other, MultiIndex): # d-level MultiIndex can equal d-tuple Index if not is_object_dtype(other.dtype): # other cannot contain tuples, so cannot match self return False + elif len(self) != len(other): + return False return 
array_equivalent(self._values, other._values) if self.nlevels != other.nlevels: return False + if len(self) != len(other): + return False + for i in range(self.nlevels): self_codes = self.codes[i] self_codes = self_codes[self_codes != -1] @@ -3729,18 +3610,6 @@ def _convert_can_do_setop(self, other): return other, result_names - def symmetric_difference(self, other, result_name=None, sort=None): - # On equal symmetric_difference MultiIndexes the difference is empty. - # Therefore, an empty MultiIndex is returned GH13490 - tups = Index.symmetric_difference(self, other, result_name, sort) - if len(tups) == 0: - return type(self)( - levels=[[] for _ in range(self.nlevels)], - codes=[[] for _ in range(self.nlevels)], - names=tups.name, - ) - return type(self).from_tuples(tups, names=tups.name) - # -------------------------------------------------------------------- @doc(Index.astype) @@ -3758,7 +3627,7 @@ def astype(self, dtype, copy=True): return self._shallow_copy() return self - def _validate_fill_value(self, item): + def _validate_insert_value(self, item): if not isinstance(item, tuple): # Pad the key with empty strings if lower levels of the key # aren't specified: @@ -3781,7 +3650,7 @@ def insert(self, loc: int, item): ------- new_index : Index """ - item = self._validate_fill_value(item) + item = self._validate_insert_value(item) new_levels = [] new_codes = [] @@ -3791,12 +3660,7 @@ def insert(self, loc: int, item): # must insert at end otherwise you have to recompute all the # other codes lev_loc = len(level) - try: - level = level.insert(lev_loc, k) - except TypeError: - # TODO: Should this be done inside insert? - # TODO: smarter casting rules? - level = level.astype(object).insert(lev_loc, k) + level = level.insert(lev_loc, k) else: lev_loc = level.get_loc(k) diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 12f61fc44582d..9eb8a8b719d41 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -1,11 +1,11 @@ +import operator from typing import Any -import warnings import numpy as np from pandas._libs import index as libindex, lib from pandas._typing import Dtype, Label -from pandas.util._decorators import doc +from pandas.util._decorators import cache_readonly, doc from pandas.core.dtypes.cast import astype_nansafe from pandas.core.dtypes.common import ( @@ -26,6 +26,7 @@ from pandas.core.dtypes.generic import ABCSeries from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna +from pandas.core import algorithms import pandas.core.common as com from pandas.core.indexes.base import Index, maybe_extract_name @@ -120,14 +121,8 @@ def _validate_fill_value(self, value): # force conversion to object # so we don't lose the bools raise TypeError - elif isinstance(value, str) or lib.is_complex(value): + if isinstance(value, str): raise TypeError - elif is_scalar(value) and isna(value): - if is_valid_nat_for_dtype(value, self.dtype): - value = self._na_value - else: - # NaT, np.datetime64("NaT"), np.timedelta64("NaT") - raise TypeError return value @@ -166,10 +161,13 @@ def _is_all_dates(self) -> bool: @doc(Index.insert) def insert(self, loc: int, item): - try: - item = self._validate_fill_value(item) - except TypeError: - return self.astype(object).insert(loc, item) + # treat NA values as nans: + if is_scalar(item) and isna(item): + if is_valid_nat_for_dtype(item, self.dtype): + item = self._na_value + else: + # NaT, np.datetime64("NaT"), np.timedelta64("NaT") + return self.astype(object).insert(loc, item) return 
super().insert(loc, item) @@ -190,6 +188,18 @@ def _union(self, other, sort): else: return super()._union(other, sort) + def _cmp_method(self, other, op): + if self.is_(other): # fastpath + if op in {operator.eq, operator.le, operator.ge}: + arr = np.ones(len(self), dtype=bool) + if self._can_hold_na: + arr[self.isna()] = False + return arr + elif op in {operator.ne, operator.lt, operator.gt}: + return np.zeros(len(self), dtype=bool) + + return super()._cmp_method(other, op) + _num_index_shared_docs[ "class_descr" @@ -233,20 +243,6 @@ class IntegerIndex(NumericIndex): """ _default_dtype: np.dtype - _can_hold_na = False - - @classmethod - def _assert_safe_casting(cls, data, subarr): - """ - Ensure incoming data can be represented with matching signed-ness. - """ - if data.dtype.kind != cls._default_dtype.kind: - if not np.array_equal(data, subarr): - raise TypeError("Unsafe NumPy casting, you must explicitly cast") - - def _can_union_without_object_cast(self, other) -> bool: - # See GH#26778, further casting may occur in NumericIndex._union - return other.dtype == "f8" or other.dtype == self.dtype def __contains__(self, key) -> bool: """ @@ -270,11 +266,6 @@ def inferred_type(self) -> str: @property def asi8(self) -> np.ndarray: # do not cache or you'll create a memory leak - warnings.warn( - "Index.asi8 is deprecated and will be removed in a future version", - FutureWarning, - stacklevel=2, - ) return self._values.view(self._default_dtype) @@ -282,9 +273,23 @@ class Int64Index(IntegerIndex): __doc__ = _num_index_shared_docs["class_descr"] % _int64_descr_args _typ = "int64index" + _can_hold_na = False _engine_type = libindex.Int64Engine _default_dtype = np.dtype(np.int64) + @classmethod + def _assert_safe_casting(cls, data, subarr): + """ + Ensure incoming data can be represented as ints. + """ + if not issubclass(data.dtype.type, np.signedinteger): + if not np.array_equal(data, subarr): + raise TypeError("Unsafe NumPy casting, you must explicitly cast") + + def _can_union_without_object_cast(self, other) -> bool: + # See GH#26778, further casting may occur in NumericIndex._union + return other.dtype == "f8" or other.dtype == self.dtype + _uint64_descr_args = dict( klass="UInt64Index", ltype="unsigned integer", dtype="uint64", extra="" @@ -295,6 +300,7 @@ class UInt64Index(IntegerIndex): __doc__ = _num_index_shared_docs["class_descr"] % _uint64_descr_args _typ = "uint64index" + _can_hold_na = False _engine_type = libindex.UInt64Engine _default_dtype = np.dtype(np.uint64) @@ -313,6 +319,21 @@ def _convert_arr_indexer(self, keyarr): return com.asarray_tuplesafe(keyarr, dtype=dtype) + # ---------------------------------------------------------------- + + @classmethod + def _assert_safe_casting(cls, data, subarr): + """ + Ensure incoming data can be represented as uints. 
+ """ + if not issubclass(data.dtype.type, np.unsignedinteger): + if not np.array_equal(data, subarr): + raise TypeError("Unsafe NumPy casting, you must explicitly cast") + + def _can_union_without_object_cast(self, other) -> bool: + # See GH#26778, further casting may occur in NumericIndex._union + return other.dtype == "f8" or other.dtype == self.dtype + _float64_descr_args = dict( klass="Float64Index", dtype="float64", ltype="float", extra="" @@ -324,7 +345,7 @@ class Float64Index(NumericIndex): _typ = "float64index" _engine_type = libindex.Float64Engine - _default_dtype = np.dtype(np.float64) + _default_dtype = np.float64 @property def inferred_type(self) -> str: @@ -403,6 +424,16 @@ def __contains__(self, other: Any) -> bool: return is_float(other) and np.isnan(other) and self.hasnans + @cache_readonly + def is_unique(self) -> bool: + return super().is_unique and self._nan_idxs.size < 2 + + @doc(Index.isin) + def isin(self, values, level=None): + if level is not None: + self._validate_index_level(level) + return algorithms.isin(np.array(self), values) + def _can_union_without_object_cast(self, other) -> bool: # See GH#26778, further casting may occur in NumericIndex._union return is_numeric_dtype(other.dtype) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 5dff07ee4c6dd..44c20ad0de848 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -1,13 +1,13 @@ from datetime import datetime, timedelta -from typing import Any, cast -import warnings +from typing import Any import numpy as np -from pandas._libs import index as libindex, lib +from pandas._libs import index as libindex +from pandas._libs.lib import no_default from pandas._libs.tslibs import BaseOffset, Period, Resolution, Tick from pandas._libs.tslibs.parsing import DateParseError, parse_time_string -from pandas._typing import DtypeObj +from pandas._typing import DtypeObj, Label from pandas.errors import InvalidIndexError from pandas.util._decorators import Appender, cache_readonly, doc @@ -65,7 +65,7 @@ def _new_PeriodIndex(cls, **d): wrap=True, ) @inherit_names(["is_leap_year", "_format_native_types"], PeriodArray) -class PeriodIndex(DatetimeIndexOpsMixin): +class PeriodIndex(DatetimeIndexOpsMixin, Int64Index): """ Immutable ndarray holding ordinal values indicating regular periods in time. @@ -146,7 +146,6 @@ class PeriodIndex(DatetimeIndexOpsMixin): _data: PeriodArray freq: BaseOffset - _data_cls = PeriodArray _engine_type = libindex.PeriodEngine _supports_partial_string_indexing = True @@ -245,12 +244,49 @@ def __new__( return cls._simple_new(data, name=name) + @classmethod + def _simple_new(cls, values: PeriodArray, name: Label = None): + """ + Create a new PeriodIndex. + + Parameters + ---------- + values : PeriodArray + Values that can be converted to a PeriodArray without inference + or coercion. + """ + assert isinstance(values, PeriodArray), type(values) + + result = object.__new__(cls) + result._data = values + # For groupby perf. 
See note in indexes/base about _index_data + result._index_data = values._data + result.name = name + result._cache = {} + result._reset_identity() + return result + # ------------------------------------------------------------------------ # Data @property def values(self) -> np.ndarray: - return np.asarray(self, dtype=object) + return np.asarray(self) + + @property + def _has_complex_internals(self) -> bool: + # used to avoid libreduction code paths, which raise or require conversion + return True + + def _shallow_copy(self, values=None, name: Label = no_default): + name = name if name is not no_default else self.name + + if values is not None: + return self._simple_new(values, name=name) + + result = self._simple_new(self._data, name=name) + result._cache = self._cache + return result def _maybe_convert_timedelta(self, other): """ @@ -303,6 +339,10 @@ def _mpl_repr(self): # how to represent ourselves to matplotlib return self.astype(object)._values + @property + def _formatter_func(self): + return self.array._formatter(boxed=False) + # ------------------------------------------------------------------------ # Indexing @@ -377,26 +417,15 @@ def asof_locs(self, where: Index, mask: np.ndarray) -> np.ndarray: return super().asof_locs(where, mask) @doc(Index.astype) - def astype(self, dtype, copy: bool = True, how=lib.no_default): + def astype(self, dtype, copy: bool = True, how="start"): dtype = pandas_dtype(dtype) - if how is not lib.no_default: - # GH#37982 - warnings.warn( - "The 'how' keyword in PeriodIndex.astype is deprecated and " - "will be removed in a future version. " - "Use index.to_timestamp(how=how) instead", - FutureWarning, - stacklevel=2, - ) - else: - how = "start" - if is_datetime64_any_dtype(dtype): # 'how' is index-specific, isn't part of the EA interface. tz = getattr(dtype, "tz", None) return self.to_timestamp(how=how).tz_localize(tz) + # TODO: should probably raise on `how` here, so we don't ignore it. 
return super().astype(dtype, copy=copy) @property @@ -436,7 +465,8 @@ def join(self, other, how="left", level=None, return_indexers=False, sort=False) ) # _assert_can_do_setop ensures we have matching dtype - result = super().join( + result = Int64Index.join( + self, other, how=how, level=level, @@ -578,9 +608,10 @@ def _maybe_cast_slice_bound(self, label, side: str, kind: str): return bounds[0 if side == "left" else 1] except ValueError as err: # string cannot be parsed as datetime-like - raise self._invalid_indexer("slice", label) from err + # TODO: we need tests for this case + raise KeyError(label) from err elif is_integer(label) or is_float(label): - raise self._invalid_indexer("slice", label) + self._invalid_indexer("slice", label) return label @@ -663,10 +694,7 @@ def difference(self, other, sort=None): if self.equals(other): # pass an empty PeriodArray with the appropriate dtype - - # TODO: overload DatetimeLikeArrayMixin.__getitem__ - values = cast(PeriodArray, self._data[:0]) - return type(self)._simple_new(values, name=self.name) + return type(self)._simple_new(self._data[:0], name=self.name) if is_object_dtype(other): return self.astype(object).difference(other).astype(self.dtype) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 669bf115df104..4b8207331838e 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -1,7 +1,7 @@ from datetime import timedelta import operator from sys import getsizeof -from typing import Any, List, Optional, Tuple +from typing import Any, List import warnings import numpy as np @@ -29,7 +29,7 @@ from pandas.core.construction import extract_array import pandas.core.indexes.base as ibase from pandas.core.indexes.base import _index_shared_docs, maybe_extract_name -from pandas.core.indexes.numeric import Float64Index, Int64Index +from pandas.core.indexes.numeric import Int64Index from pandas.core.ops.common import unpack_zerodim_and_defer _empty_range = range(0) @@ -397,8 +397,6 @@ def _shallow_copy(self, values=None, name: Label = no_default): name = self.name if name is no_default else name if values is not None: - if values.dtype.kind == "f": - return Float64Index(values, name=name) return Int64Index._simple_new(values, name=name) result = self._simple_new(self._range, name=name) @@ -461,16 +459,6 @@ def argsort(self, *args, **kwargs) -> np.ndarray: else: return np.arange(len(self) - 1, -1, -1) - def factorize( - self, sort: bool = False, na_sentinel: Optional[int] = -1 - ) -> Tuple[np.ndarray, "RangeIndex"]: - codes = np.arange(len(self), dtype=np.intp) - uniques = self - if sort and self.step < 0: - codes = codes[::-1] - uniques = uniques[::-1] - return codes, uniques - def equals(self, other: object) -> bool: """ Determines if two Index objects contain the same elements. @@ -670,17 +658,13 @@ def difference(self, other, sort=None): if not isinstance(overlap, RangeIndex): # We wont end up with RangeIndex, so fall back return super().difference(other, sort=sort) - if overlap.step != first.step: - # In some cases we might be able to get a RangeIndex back, - # but not worth the effort. 
- return super().difference(other, sort=sort) if overlap[0] == first.start: # The difference is everything after the intersection new_rng = range(overlap[-1] + first.step, first.stop, first.step) - elif overlap[-1] == first[-1]: + elif overlap[-1] == first.stop: # The difference is everything before the intersection - new_rng = range(first.start, overlap[0], first.step) + new_rng = range(first.start, overlap[0] - first.step, first.step) else: # The difference is not range-like return super().difference(other, sort=sort) diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index fcab3e1f6a0a4..cf5fa4bbb3d75 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -2,7 +2,7 @@ from pandas._libs import index as libindex, lib from pandas._libs.tslibs import Timedelta, to_offset -from pandas._typing import DtypeObj +from pandas._typing import DtypeObj, Label from pandas.errors import InvalidIndexError from pandas.util._decorators import doc @@ -103,7 +103,6 @@ class TimedeltaIndex(DatetimeTimedeltaMixin): _typ = "timedeltaindex" - _data_cls = TimedeltaArray _engine_type = libindex.TimedeltaEngine _comparables = ["name", "freq"] @@ -157,6 +156,29 @@ def __new__( ) return cls._simple_new(tdarr, name=name) + @classmethod + def _simple_new(cls, values: TimedeltaArray, name: Label = None): + assert isinstance(values, TimedeltaArray) + + result = object.__new__(cls) + result._data = values + result._name = name + result._cache = {} + # For groupby perf. See note in indexes/base about _index_data + result._index_data = values._data + + result._reset_identity() + return result + + # ------------------------------------------------------------------- + # Rendering Methods + + @property + def _formatter_func(self): + from pandas.io.formats.format import get_format_timedelta64 + + return get_format_timedelta64(self, box=True) + # ------------------------------------------------------------------- @doc(Index.astype) @@ -223,12 +245,15 @@ def _maybe_cast_slice_bound(self, label, side: str, kind): else: return lbound + to_offset(parsed.resolution_string) - Timedelta(1, "ns") elif not isinstance(label, self._data._recognized_scalars): - raise self._invalid_indexer("slice", label) + self._invalid_indexer("slice", label) return label # ------------------------------------------------------------------- + def is_type_compatible(self, typ) -> bool: + return typ == self.inferred_type or typ == "timedelta" + @property def inferred_type(self) -> str: return "timedelta64" diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 6aa031af64833..c5e331a104726 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -650,9 +650,9 @@ def _ensure_listlike_indexer(self, key, axis=None, value=None): if self.ndim != 2: return - if isinstance(key, tuple) and not isinstance(self.obj.index, ABCMultiIndex): + if isinstance(key, tuple): # key may be a tuple if we are .loc - # if index is not a MultiIndex, set key to column part + # in that case, set key to the column part of key key = key[column_axis] axis = column_axis @@ -667,9 +667,6 @@ def _ensure_listlike_indexer(self, key, axis=None, value=None): if k not in self.obj: if value is None: self.obj[k] = np.nan - elif is_array_like(value) and value.ndim == 2: - # GH#37964 have to select columnwise in case of array - self.obj[k] = value[:, i] elif is_list_like(value): self.obj[k] = value[i] else: @@ -684,7 +681,7 @@ def __setitem__(self, key, value): self._has_valid_setitem_indexer(key) iloc = 
self if self.name == "iloc" else self.obj.iloc - iloc._setitem_with_indexer(indexer, value, self.name) + iloc._setitem_with_indexer(indexer, value) def _validate_key(self, key, axis: int): """ @@ -1021,7 +1018,7 @@ def _multi_take(self, tup: Tuple): def _getitem_iterable(self, key, axis: int): """ - Index current object with an iterable collection of keys. + Index current object with an an iterable collection of keys. Parameters ---------- @@ -1249,7 +1246,9 @@ def _get_listlike_indexer(self, key, axis: int, raise_missing: bool = False): indexer, keyarr = ax._convert_listlike_indexer(key) # We only act on all found values: if indexer is not None and (indexer != -1).all(): - # _validate_read_indexer is a no-op if no -1s, so skip + self._validate_read_indexer( + keyarr, indexer, axis, raise_missing=raise_missing + ) return ax[indexer], indexer if ax._index_as_unique: @@ -1310,15 +1309,21 @@ def _validate_read_indexer( not_found = list(set(key) - set(ax)) raise KeyError(f"{not_found} not in index") - not_found = key[missing_mask] - - with option_context("display.max_seq_items", 10, "display.width", 80): - raise KeyError( - "Passing list-likes to .loc or [] with any missing labels " - "is no longer supported. " - f"The following labels were missing: {not_found}. " - "See https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike" # noqa:E501 - ) + # we skip the warning on Categorical + # as this check is actually done (check for + # non-missing values), but a bit later in the + # code, so we want to avoid warning & then + # just raising + if not ax.is_categorical(): + not_found = key[missing_mask] + + with option_context("display.max_seq_items", 10, "display.width", 80): + raise KeyError( + "Passing list-likes to .loc or [] with any missing labels " + "is no longer supported. " + f"The following labels were missing: {not_found}. " + "See https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike" # noqa:E501 + ) @doc(IndexingMixin.iloc) @@ -1520,7 +1525,7 @@ def _get_setitem_indexer(self, key): # ------------------------------------------------------------------- - def _setitem_with_indexer(self, indexer, value, name="iloc"): + def _setitem_with_indexer(self, indexer, value): """ _setitem_with_indexer is for setting values on a Series/DataFrame using positional indexers. @@ -1596,7 +1601,7 @@ def _setitem_with_indexer(self, indexer, value, name="iloc"): new_indexer = convert_from_missing_indexer_tuple( indexer, self.obj.axes ) - self._setitem_with_indexer(new_indexer, value, name) + self._setitem_with_indexer(new_indexer, value) return @@ -1627,11 +1632,11 @@ def _setitem_with_indexer(self, indexer, value, name="iloc"): # align and set the values if take_split_path: # We have to operate column-wise - self._setitem_with_indexer_split_path(indexer, value, name) + self._setitem_with_indexer_split_path(indexer, value) else: - self._setitem_single_block(indexer, value, name) + self._setitem_single_block(indexer, value) - def _setitem_with_indexer_split_path(self, indexer, value, name: str): + def _setitem_with_indexer_split_path(self, indexer, value): """ Setitem column-wise. 
""" @@ -1642,82 +1647,81 @@ def _setitem_with_indexer_split_path(self, indexer, value, name: str): indexer = _tuplify(self.ndim, indexer) if len(indexer) > self.ndim: raise IndexError("too many indices for array") - if isinstance(indexer[0], np.ndarray) and indexer[0].ndim > 2: - raise ValueError(r"Cannot set values with ndim > 2") - if isinstance(value, ABCSeries) and name != "iloc": + if isinstance(value, ABCSeries): value = self._align_series(indexer, value) # Ensure we have something we can iterate over - info_axis = indexer[1] - ilocs = self._ensure_iterable_column_indexer(info_axis) + ilocs = self._ensure_iterable_column_indexer(indexer[1]) - pi = indexer[0] - lplane_indexer = length_of_indexer(pi, self.obj.index) + plane_indexer = indexer[:1] + lplane_indexer = length_of_indexer(plane_indexer[0], self.obj.index) # lplane_indexer gives the expected length of obj[indexer[0]] + if len(ilocs) == 1: + # We can operate on a single column + + # require that we are setting the right number of values that + # we are indexing + if is_list_like_indexer(value) and 0 != lplane_indexer != len(value): + # Exclude zero-len for e.g. boolean masking that is all-false + raise ValueError( + "cannot set using a multi-index " + "selection indexer with a different " + "length than the value" + ) + # we need an iterable, with a ndim of at least 1 # eg. don't pass through np.array(0) if is_list_like_indexer(value) and getattr(value, "ndim", 1) > 0: + # we have an equal len Frame if isinstance(value, ABCDataFrame): - self._setitem_with_indexer_frame_value(indexer, value, name) + self._setitem_with_indexer_frame_value(indexer, value) + # we have an equal len ndarray/convertible to our ilocs + # hasattr first, to avoid coercing to ndarray without reason. + # But we may be relying on the ndarray coercion to check ndim. + # Why not just convert to an ndarray earlier on if needed? elif np.ndim(value) == 2: self._setitem_with_indexer_2d_value(indexer, value) - elif len(ilocs) == 1 and lplane_indexer == len(value) and not is_scalar(pi): - # We are setting multiple rows in a single column. - self._setitem_single_column(ilocs[0], value, pi) - - elif len(ilocs) == 1 and 0 != lplane_indexer != len(value): - # We are trying to set N values into M entries of a single - # column, which is invalid for N != M - # Exclude zero-len for e.g. boolean masking that is all-false - - if len(value) == 1 and not is_integer(info_axis): - # This is a case like df.iloc[:3, [1]] = [0] - # where we treat as df.iloc[:3, 1] = 0 - return self._setitem_with_indexer((pi, info_axis[0]), value[0]) - - raise ValueError( - "Must have equal len keys and value " - "when setting with an iterable" - ) + elif ( + len(ilocs) == 1 + and lplane_indexer == len(value) + and not is_scalar(plane_indexer[0]) + ): + # we have an equal len list/ndarray + # We only get here with len(ilocs) == 1 + self._setitem_single_column(ilocs[0], value, plane_indexer) elif lplane_indexer == 0 and len(value) == len(self.obj.index): # We get here in one case via .loc with a all-False mask pass - elif len(ilocs) == len(value): - # We are setting multiple columns in a single row. - for loc, v in zip(ilocs, value): - self._setitem_single_column(loc, v, pi) - - elif len(ilocs) == 1 and com.is_null_slice(pi) and len(self.obj) == 0: - # This is a setitem-with-expansion, see - # test_loc_setitem_empty_append_expands_rows_mixed_dtype - # e.g. 
df = DataFrame(columns=["x", "y"]) - # df["x"] = df["x"].astype(np.int64) - # df.loc[:, "x"] = [1, 2, 3] - self._setitem_single_column(ilocs[0], value, pi) - else: - raise ValueError( - "Must have equal len keys and value " - "when setting with an iterable" - ) + # per-label values + if len(ilocs) != len(value): + raise ValueError( + "Must have equal len keys and value " + "when setting with an iterable" + ) + for loc, v in zip(ilocs, value): + self._setitem_single_column(loc, v, plane_indexer) else: + if isinstance(indexer[0], np.ndarray) and indexer[0].ndim > 2: + raise ValueError(r"Cannot set values with ndim > 2") + # scalar value for loc in ilocs: - self._setitem_single_column(loc, value, pi) + self._setitem_single_column(loc, value, plane_indexer) def _setitem_with_indexer_2d_value(self, indexer, value): # We get here with np.ndim(value) == 2, excluding DataFrame, # which goes through _setitem_with_indexer_frame_value - pi = indexer[0] + plane_indexer = indexer[:1] ilocs = self._ensure_iterable_column_indexer(indexer[1]) @@ -1730,25 +1734,19 @@ def _setitem_with_indexer_2d_value(self, indexer, value): for i, loc in enumerate(ilocs): # setting with a list, re-coerces - self._setitem_single_column(loc, value[:, i].tolist(), pi) + self._setitem_single_column(loc, value[:, i].tolist(), plane_indexer) - def _setitem_with_indexer_frame_value(self, indexer, value: "DataFrame", name: str): + def _setitem_with_indexer_frame_value(self, indexer, value: "DataFrame"): ilocs = self._ensure_iterable_column_indexer(indexer[1]) sub_indexer = list(indexer) - pi = indexer[0] + plane_indexer = indexer[:1] multiindex_indexer = isinstance(self.obj.columns, ABCMultiIndex) unique_cols = value.columns.is_unique - # We do not want to align the value in case of iloc GH#37728 - if name == "iloc": - for i, loc in enumerate(ilocs): - val = value.iloc[:, i] - self._setitem_single_column(loc, val, pi) - - elif not unique_cols and value.columns.equals(self.obj.columns): + if not unique_cols and value.columns.equals(self.obj.columns): # We assume we are already aligned, see # test_iloc_setitem_frame_duplicate_columns_multiple_blocks for loc in ilocs: @@ -1763,7 +1761,7 @@ def _setitem_with_indexer_frame_value(self, indexer, value: "DataFrame", name: s else: val = np.nan - self._setitem_single_column(loc, val, pi) + self._setitem_single_column(loc, val, plane_indexer) elif not unique_cols: raise ValueError("Setting with non-unique columns is not allowed.") @@ -1779,18 +1777,10 @@ def _setitem_with_indexer_frame_value(self, indexer, value: "DataFrame", name: s else: val = np.nan - self._setitem_single_column(loc, val, pi) + self._setitem_single_column(loc, val, plane_indexer) def _setitem_single_column(self, loc: int, value, plane_indexer): - """ - - Parameters - ---------- - loc : int - Indexer for column position - plane_indexer : int, slice, listlike[int] - The indexer we use for setitem along axis=0. 
- """ + # positional setting on column loc pi = plane_indexer ser = self.obj._ixs(loc, axis=1) @@ -1800,18 +1790,21 @@ def _setitem_single_column(self, loc: int, value, plane_indexer): # which means essentially reassign to the columns of a # multi-dim object # GH#6149 (null slice), GH#10408 (full bounds) - if com.is_null_slice(pi) or com.is_full_slice(pi, len(self.obj)): + if isinstance(pi, tuple) and all( + com.is_null_slice(idx) or com.is_full_slice(idx, len(self.obj)) + for idx in pi + ): ser = value else: # set the item, possibly having a dtype change ser = ser.copy() - ser._mgr = ser._mgr.setitem(indexer=(pi,), value=value) + ser._mgr = ser._mgr.setitem(indexer=pi, value=value) ser._maybe_update_cacher(clear=True) # reset the sliced object if unique self.obj._iset_item(loc, ser) - def _setitem_single_block(self, indexer, value, name: str): + def _setitem_single_block(self, indexer, value): """ _setitem_with_indexer for the case when we have a single Block. """ @@ -1839,13 +1832,14 @@ def _setitem_single_block(self, indexer, value, name: str): return indexer = maybe_convert_ix(*indexer) - if (isinstance(value, ABCSeries) and name != "iloc") or isinstance(value, dict): + + if isinstance(value, (ABCSeries, dict)): # TODO(EA): ExtensionBlock.setitem this causes issues with # setting for extensionarrays that store dicts. Need to decide # if it's worth supporting that. value = self._align_series(indexer, Series(value)) - elif isinstance(value, ABCDataFrame) and name != "iloc": + elif isinstance(value, ABCDataFrame): value = self._align_frame(indexer, value) # check for chained assignment @@ -1877,8 +1871,7 @@ def _setitem_with_indexer_missing(self, indexer, value): if index.is_unique: new_indexer = index.get_indexer([new_index[-1]]) if (new_indexer != -1).any(): - # We get only here with loc, so can hard code - return self._setitem_with_indexer(new_indexer, value, "loc") + return self._setitem_with_indexer(new_indexer, value) # this preserves dtype of the value new_values = Series([value])._values @@ -1949,7 +1942,7 @@ def _align_series(self, indexer, ser: "Series", multiindex_indexer: bool = False to the locations selected by `indexer` """ if isinstance(indexer, (slice, np.ndarray, list, Index)): - indexer = (indexer,) + indexer = tuple([indexer]) if isinstance(indexer, tuple): @@ -2022,7 +2015,7 @@ def ravel(i): raise ValueError("Incompatible indexer with Series") - def _align_frame(self, indexer, df: "DataFrame"): + def _align_frame(self, indexer, df: ABCDataFrame): is_frame = self.ndim == 2 if isinstance(indexer, tuple): @@ -2088,7 +2081,7 @@ def __getitem__(self, key): # we could have a convertible item here (e.g. Timestamp) if not is_list_like_indexer(key): - key = (key,) + key = tuple([key]) else: raise ValueError("Invalid call for scalar access (getting)!") @@ -2214,10 +2207,9 @@ def convert_to_index_sliceable(obj: "DataFrame", key): try: res = idx._get_string_slice(key) warnings.warn( - "Indexing a DataFrame with a datetimelike index using a single " - "string to slice the rows, like `frame[string]`, is deprecated " - "and will be removed in a future version. Use `frame.loc[string]` " - "instead.", + "Indexing on datetimelike rows with `frame[string]` is " + "deprecated and will be removed in a future version. 
" + "Use `frame.loc[string]` instead.", FutureWarning, stacklevel=3, ) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 74b5a184df95d..ed77a210b6913 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -32,7 +32,6 @@ TD64NS_DTYPE, is_bool_dtype, is_categorical_dtype, - is_datetime64_any_dtype, is_datetime64_dtype, is_datetime64tz_dtype, is_dtype_equal, @@ -124,16 +123,7 @@ def _simple_new( obj._mgr_locs = placement return obj - def __init__(self, values, placement, ndim: int): - """ - Parameters - ---------- - values : np.ndarray or ExtensionArray - placement : BlockPlacement (or castable) - ndim : int - 1 for SingleBlockManager/Series, 2 for BlockManager/DataFrame - """ - # TODO(EA2D): ndim will be unnecessary with 2D EAs + def __init__(self, values, placement, ndim=None): self.ndim = self._check_ndim(values, ndim) self.mgr_locs = placement self.values = self._maybe_coerce_values(values) @@ -427,7 +417,6 @@ def fillna( inplace = validate_bool_kwarg(inplace, "inplace") mask = isna(self.values) - mask = _extract_bool_array(mask) if limit is not None: limit = libalgos.validate_limit(None, limit=limit) mask[mask.cumsum(self.ndim - 1) > limit] = False @@ -439,10 +428,9 @@ def fillna( return [self.copy()] if self._can_hold_element(value): - nb = self if inplace else self.copy() - nb._putmask_simple(mask, value) - # TODO: should be nb._maybe_downcast? - return self._maybe_downcast([nb], downcast) + # equivalent: _try_coerce_args(value) would not raise + blocks = self.putmask(mask, value, inplace=inplace) + return self._maybe_downcast(blocks, downcast) # we can't process the value, but nothing to do if not mask.any(): @@ -460,23 +448,7 @@ def f(mask, val, idx): return self.split_and_operate(None, f, inplace) - def _split(self) -> List["Block"]: - """ - Split a block into a list of single-column blocks. - """ - assert self.ndim == 2 - - new_blocks = [] - for i, ref_loc in enumerate(self.mgr_locs): - vals = self.values[slice(i, i + 1)] - - nb = self.make_block(vals, [ref_loc]) - new_blocks.append(nb) - return new_blocks - - def split_and_operate( - self, mask, f, inplace: bool, ignore_failures: bool = False - ) -> List["Block"]: + def split_and_operate(self, mask, f, inplace: bool) -> List["Block"]: """ split the block per-column, and apply the callable f per-column, return a new block for each. Handle @@ -486,8 +458,7 @@ def split_and_operate( ---------- mask : 2-d boolean mask f : callable accepting (1d-mask, 1d values, indexer) - inplace : bool - ignore_failures : bool, default False + inplace : boolean Returns ------- @@ -526,16 +497,8 @@ def make_a_block(nv, ref_loc): v = new_values[i] # need a new block - if m.any() or m.size == 0: - # Apply our function; we may ignore_failures if this is a - # reduction that is dropping nuisance columns GH#37827 - try: - nv = f(m, v, i) - except TypeError: - if ignore_failures: - continue - else: - raise + if m.any(): + nv = f(m, v, i) else: nv = v if inplace else v.copy() @@ -772,12 +735,40 @@ def replace( inplace = validate_bool_kwarg(inplace, "inplace") original_to_replace = to_replace + # If we cannot replace with own dtype, convert to ObjectBlock and + # retry if not self._can_hold_element(to_replace): - # We cannot hold `to_replace`, so we know immediately that - # replacing it is a no-op. - # Note: If to_replace were a list, NDFrame.replace would call - # replace_list instead of replace. 
- return [self] if inplace else [self.copy()] + if not isinstance(to_replace, list): + if inplace: + return [self] + return [self.copy()] + + to_replace = [x for x in to_replace if self._can_hold_element(x)] + if not len(to_replace): + # GH#28084 avoid costly checks since we can infer + # that there is nothing to replace in this block + if inplace: + return [self] + return [self.copy()] + + if len(to_replace) == 1: + # _can_hold_element checks have reduced this back to the + # scalar case and we can avoid a costly object cast + return self.replace(to_replace[0], value, inplace=inplace, regex=regex) + + # GH 22083, TypeError or ValueError occurred within error handling + # causes infinite loop. Cast and retry only if not objectblock. + if is_object_dtype(self): + raise AssertionError + + # try again with a compatible block + block = self.astype(object) + return block.replace( + to_replace=to_replace, + value=value, + inplace=inplace, + regex=regex, + ) values = self.values if lib.is_scalar(to_replace) and isinstance(values, np.ndarray): @@ -787,23 +778,39 @@ def replace( to_replace = convert_scalar_for_putitemlike(to_replace, values.dtype) mask = missing.mask_missing(values, to_replace) - if not mask.any(): - # Note: we get here with test_replace_extension_other incorrectly - # bc _can_hold_element is incorrect. - return [self] if inplace else [self.copy()] - if not self._can_hold_element(value): - blk = self.astype(object) - return blk.replace( + try: + blocks = self.putmask(mask, value, inplace=inplace) + # Note: it is _not_ the case that self._can_hold_element(value) + # is always true at this point. In particular, that can fail + # for: + # "2u" with bool-dtype, float-dtype + # 0.5 with int64-dtype + # np.nan with int64-dtype + except (TypeError, ValueError): + # GH 22083, TypeError or ValueError occurred within error handling + # causes infinite loop. Cast and retry only if not objectblock. + if is_object_dtype(self): + raise + + if not self.is_extension: + # TODO: https://github.com/pandas-dev/pandas/issues/32586 + # Need an ExtensionArray._can_hold_element to indicate whether + # a scalar value can be placed in the array. + assert not self._can_hold_element(value), value + + # try again with a compatible block + block = self.astype(object) + return block.replace( to_replace=original_to_replace, value=value, - inplace=True, + inplace=inplace, regex=regex, ) - blk = self if inplace else self.copy() - blk._putmask_simple(mask, value) - blocks = blk.convert(numeric=False, copy=not inplace) + blocks = extend_blocks( + [b.convert(numeric=False, copy=not inplace) for b in blocks] + ) return blocks def _replace_regex( @@ -861,15 +868,7 @@ def _replace_list( """ See BlockManager._replace_list docstring. 
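# Rough user-level sketch of the _can_hold_element filtering above: values that a
# column's dtype could never contain are skipped, so replacing them is a no-op.
import pandas as pd

s = pd.Series([1, 2, 3])       # backed by a single int64 block
s.replace("a", 0)              # unchanged; "a" cannot occur in an int64 column
s.replace([1, "a"], 0)         # only the 1 is replaced -> [0, 2, 3]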
""" - # Exclude anything that we know we won't contain - pairs = [ - (x, y) for x, y in zip(src_list, dest_list) if self._can_hold_element(x) - ] - if not len(pairs): - # shortcut, nothing to replace - return [self] if inplace else [self.copy()] - - src_len = len(pairs) - 1 + src_len = len(src_list) - 1 def comp(s: Scalar, mask: np.ndarray, regex: bool = False) -> np.ndarray: """ @@ -882,19 +881,14 @@ def comp(s: Scalar, mask: np.ndarray, regex: bool = False) -> np.ndarray: s = maybe_box_datetimelike(s) return compare_or_regex_search(self.values, s, regex, mask) - if self.is_object: - # Calculate the mask once, prior to the call of comp - # in order to avoid repeating the same computations - mask = ~isna(self.values) - masks = [comp(s[0], mask, regex) for s in pairs] - else: - # GH#38086 faster if we know we dont need to check for regex - masks = [missing.mask_missing(self.values, s[0]) for s in pairs] + # Calculate the mask once, prior to the call of comp + # in order to avoid repeating the same computations + mask = ~isna(self.values) - masks = [_extract_bool_array(x) for x in masks] + masks = [comp(s, mask, regex) for s in src_list] rb = [self if inplace else self.copy()] - for i, (src, dest) in enumerate(pairs): + for i, (src, dest) in enumerate(zip(src_list, dest_list)): new_rb: List["Block"] = [] for blk in rb: m = masks[i] @@ -1030,35 +1024,6 @@ def setitem(self, indexer, value): block = self.make_block(values) return block - def _putmask_simple(self, mask: np.ndarray, value: Any): - """ - Like putmask but - - a) we do not cast on failure - b) we do not handle repeating or truncating like numpy. - - Parameters - ---------- - mask : np.ndarray[bool] - We assume _extract_bool_array has already been called. - value : Any - We assume self._can_hold_element(value) - """ - values = self.values - - if lib.is_scalar(value) and isinstance(values, np.ndarray): - value = convert_scalar_for_putitemlike(value, values.dtype) - - if self.is_extension or (self.is_object and not lib.is_scalar(value)): - # GH#19266 using np.putmask gives unexpected results with listlike value - if is_list_like(value) and len(value) == len(values): - values[mask] = value[mask] - else: - values[mask] = value - else: - # GH#37833 np.putmask is more performant than __setitem__ - np.putmask(values, mask, value) - def putmask( self, mask, new, inplace: bool = False, axis: int = 0, transpose: bool = False ) -> List["Block"]: @@ -1210,15 +1175,39 @@ def coerce_to_target_dtype(self, other): # don't coerce float/complex to int return self - elif self.is_datetime or is_datetime64_any_dtype(dtype): - # The is_dtype_equal check above ensures that at most one of - # these two conditions hold, so we must cast to object. - return self.astype(object) + elif ( + self.is_datetime + or is_datetime64_dtype(dtype) + or is_datetime64tz_dtype(dtype) + ): + + # not a datetime + if not ( + (is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype)) + and self.is_datetime + ): + return self.astype(object) + + # don't upcast timezone with different timezone or no timezone + mytz = getattr(self.dtype, "tz", None) + othertz = getattr(dtype, "tz", None) + + if not tz_compare(mytz, othertz): + return self.astype(object) + + raise AssertionError( + f"possible recursion in coerce_to_target_dtype: {self} {other}" + ) elif self.is_timedelta or is_timedelta64_dtype(dtype): - # The is_dtype_equal check above ensures that at most one of - # these two conditions hold, so we must cast to object. 
- return self.astype(object) + + # not a timedelta + if not (is_timedelta64_dtype(dtype) and self.is_timedelta): + return self.astype(object) + + raise AssertionError( + f"possible recursion in coerce_to_target_dtype: {self} {other}" + ) try: return self.astype(dtype) @@ -1262,6 +1251,7 @@ def interpolate( axis=axis, inplace=inplace, limit=limit, + coerce=coerce, downcast=downcast, ) # validate the interp method @@ -1288,12 +1278,20 @@ def _interpolate_with_fill( axis: int = 0, inplace: bool = False, limit: Optional[int] = None, + coerce: bool = False, downcast: Optional[str] = None, ) -> List["Block"]: """ fillna but using the interpolate machinery """ inplace = validate_bool_kwarg(inplace, "inplace") - assert self._can_hold_na # checked by caller + # if we are coercing, then don't force the conversion + # if the block can't hold the type + if coerce: + if not self._can_hold_na: + if inplace: + return [self] + else: + return [self.copy()] values = self.values if inplace else self.values.copy() @@ -1449,27 +1447,39 @@ def where( if values.ndim - 1 == other.ndim and axis == 1: other = other.reshape(tuple(other.shape + (1,))) elif transpose and values.ndim == self.ndim - 1: - # TODO(EA2D): not neceesssary with 2D EAs cond = cond.T if not hasattr(cond, "shape"): raise ValueError("where must have a condition that is ndarray like") + def where_func(cond, values, other): + + if not ( + (self.is_integer or self.is_bool) + and lib.is_float(other) + and np.isnan(other) + ): + # np.where will cast integer array to floats in this case + if not self._can_hold_element(other): + raise TypeError + if lib.is_scalar(other) and isinstance(values, np.ndarray): + # convert datetime to datetime64, timedelta to timedelta64 + other = convert_scalar_for_putitemlike(other, values.dtype) + + # By the time we get here, we should have all Series/Index + # args extracted to ndarray + fastres = expressions.where(cond, values, other) + return fastres + if cond.ravel("K").all(): result = values else: # see if we can operate on the entire block, or need item-by-item # or if we are a single block (ndim == 1) - if ( - (self.is_integer or self.is_bool) - and lib.is_float(other) - and np.isnan(other) - ): - # GH#3733 special case to avoid object-dtype casting - # and go through numexpr path instead. 
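# The integer-with-NaN special case discussed here, sketched at the Series level:
# masking an int64 column with the default NaN filler forces a float result, while
# a compatible fill value keeps the integer dtype.
import pandas as pd

s = pd.Series([1, 2, 3])
s.where(s > 1)        # other defaults to NaN -> float64 [NaN, 2.0, 3.0]
s.where(s > 1, 0)     # integer fill value keeps int64   [0, 2, 3]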
- # In integer case, np.where will cast to floats - pass - elif not self._can_hold_element(other): + try: + result = where_func(cond, values, other) + except TypeError: + # we cannot coerce, return a compat dtype # we are explicitly ignoring errors block = self.coerce_to_target_dtype(other) @@ -1478,18 +1488,6 @@ def where( ) return self._maybe_downcast(blocks, "infer") - if not ( - (self.is_integer or self.is_bool) - and lib.is_float(other) - and np.isnan(other) - ): - # convert datetime to datetime64, timedelta to timedelta64 - other = convert_scalar_for_putitemlike(other, values.dtype) - - # By the time we get here, we should have all Series/Index - # args extracted to ndarray - result = expressions.where(cond, values, other) - if self._can_hold_na or self.ndim == 1: if transpose: @@ -1630,11 +1628,8 @@ def _replace_coerce( """ if mask.any(): if not regex: - nb = self.coerce_to_target_dtype(value) - if nb is self and not inplace: - nb = nb.copy() - nb._putmask_simple(mask, value) - return [nb] + self = self.coerce_to_target_dtype(value) + return self.putmask(mask, value, inplace=inplace) else: regex = _should_use_regex(regex, to_replace) if regex: @@ -1667,7 +1662,7 @@ class ExtensionBlock(Block): values: ExtensionArray - def __init__(self, values, placement, ndim: int): + def __init__(self, values, placement, ndim=None): """ Initialize a non-consolidatable block. @@ -2044,16 +2039,6 @@ class ObjectValuesExtensionBlock(ExtensionBlock): def external_values(self): return self.values.astype(object) - def _can_hold_element(self, element: Any) -> bool: - if is_valid_nat_for_dtype(element, self.dtype): - return True - if isinstance(element, list) and len(element) == 0: - return True - tipo = maybe_infer_dtype_type(element) - if tipo is not None: - return issubclass(tipo.type, self.dtype.type) - return isinstance(element, self.dtype.type) - class NumericBlock(Block): __slots__ = () @@ -2193,9 +2178,7 @@ def diff(self, n: int, axis: int = 0) -> List["Block"]: values = self.array_values().reshape(self.shape) new_values = values - values.shift(n, axis=axis) - return [ - TimeDeltaBlock(new_values, placement=self.mgr_locs.indexer, ndim=self.ndim) - ] + return [TimeDeltaBlock(new_values, placement=self.mgr_locs.indexer)] def shift(self, periods, axis=0, fill_value=None): # TODO(EA2D) this is unnecessary if these blocks are backed by 2D EAs @@ -2428,8 +2411,9 @@ def _can_hold_element(self, element: Any) -> bool: return is_valid_nat_for_dtype(element, self.dtype) def fillna(self, value, **kwargs): - # TODO(EA2D): if we operated on array_values, TDA.fillna would handle - # raising here. 
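# Behaviour enforced just below, as a small sketch: filling timedelta data with a
# bare integer is rejected rather than being silently read as nanoseconds.
import pandas as pd

s = pd.Series(pd.to_timedelta(["1 day", pd.NaT]))
s.fillna(pd.Timedelta(0))      # fine
# s.fillna(1) raises TypeError; pass a Timedelta (or NaT) instead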
+ + # allow filling with integers to be + # interpreted as nanoseconds if is_integer(value): # Deprecation GH#24694, GH#19233 raise TypeError( @@ -2484,9 +2468,7 @@ def mask_func(mask, values, inplace): values = values.reshape(1, -1) return func(values) - return self.split_and_operate( - None, mask_func, False, ignore_failures=ignore_failures - ) + return self.split_and_operate(None, mask_func, False) try: res = func(values) @@ -2592,7 +2574,7 @@ def _replace_list( regex: bool = False, ) -> List["Block"]: if len(algos.unique(dest_list)) == 1: - # We likely got here by tiling value inside NDFrame.replace, + # We got likely here by tiling value inside NDFrame.replace, # so un-tile here return self.replace(src_list, dest_list[0], inplace, regex) return super()._replace_list(src_list, dest_list, inplace, regex) @@ -2646,7 +2628,6 @@ def get_block_type(values, dtype=None): elif is_interval_dtype(dtype) or is_period_dtype(dtype): cls = ObjectValuesExtensionBlock elif is_extension_array_dtype(values.dtype): - # Note: need to be sure PandasArray is unwrapped before we get here cls = ExtensionBlock elif issubclass(vtype, np.floating): cls = FloatBlock diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 06de1972b4c9a..8efba87b14ce5 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -82,7 +82,6 @@ def concatenate_block_managers( b = make_block( _concatenate_join_units(join_units, concat_axis, copy=copy), placement=placement, - ndim=len(axes), ) blocks.append(b) @@ -228,7 +227,7 @@ def is_na(self) -> bool: return isna_all(values_flat) - def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na): + def get_reindexed_values(self, empty_dtype, upcasted_na): if upcasted_na is None: # No upcasting is necessary fill_value = self.block.fill_value @@ -249,8 +248,9 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na): empty_dtype ): if self.block is None: + array = empty_dtype.construct_array_type() # TODO(EA2D): special case unneeded with 2D EAs - return DatetimeArray( + return array( np.full(self.shape[1], fill_value.value), dtype=empty_dtype ) elif getattr(self.block, "is_categorical", False): diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index eefd1a604f894..bcafa2c2fdca7 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -225,8 +225,7 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool): # TODO: What about re-joining object columns? 
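# Internals sketch of the per-dtype block layout that init_ndarray / form_blocks
# arrange, assuming pandas 1.x where the manager is exposed as DataFrame._mgr.
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["x", "y"]})
[(type(b).__name__, b.dtype) for b in df._mgr.blocks]
# e.g. [('FloatBlock', float64), ('IntBlock', int64), ('ObjectBlock', object)]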
block_values = [ - make_block(dvals_list[n], placement=[n], ndim=2) - for n in range(len(dvals_list)) + make_block(dvals_list[n], placement=[n]) for n in range(len(dvals_list)) ] else: @@ -370,7 +369,7 @@ def extract_index(data) -> Index: index = Index([]) elif len(data) > 0: raw_lengths = [] - indexes: List[Union[List[Label], Index]] = [] + indexes = [] have_raw_arrays = False have_series = False diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 4cd7cc56144d9..767c653f8a404 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -33,7 +33,7 @@ ) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.dtypes import ExtensionDtype -from pandas.core.dtypes.generic import ABCDataFrame, ABCPandasArray, ABCSeries +from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries from pandas.core.dtypes.missing import array_equals, isna import pandas.core.algorithms as algos @@ -442,7 +442,6 @@ def apply( def quantile( self, axis: int = 0, - consolidate: bool = True, transposed: bool = False, interpolation="linear", qs=None, @@ -456,8 +455,6 @@ def quantile( Parameters ---------- axis: reduction axis, default 0 - consolidate: bool, default True. Join together blocks having same - dtype transposed: bool, default False we are holding transposed data interpolation : type of interpolation, default 'linear' @@ -472,9 +469,6 @@ def quantile( # simplify some of the code here and in the blocks assert self.ndim >= 2 - if consolidate: - self._consolidate_inplace() - def get_axe(block, qs, axes): # Because Series dispatches to DataFrame, we will always have # block.ndim == 2 @@ -719,28 +713,13 @@ def is_view(self) -> bool: def get_bool_data(self, copy: bool = False) -> "BlockManager": """ - Select blocks that are bool-dtype and columns from object-dtype blocks - that are all-bool. - Parameters ---------- copy : bool, default False Whether to copy the blocks """ - - new_blocks = [] - - for blk in self.blocks: - if blk.dtype == bool: - new_blocks.append(blk) - - elif blk.is_object: - nbs = blk._split() - for nb in nbs: - if nb.is_bool: - new_blocks.append(nb) - - return self._combine(new_blocks, copy) + self._consolidate_inplace() + return self._combine([b for b in self.blocks if b.is_bool], copy) def get_numeric_data(self, copy: bool = False) -> "BlockManager": """ @@ -1438,7 +1417,7 @@ def _make_na_block(self, placement, fill_value=None): dtype, fill_value = infer_dtype_from_scalar(fill_value) block_values = np.empty(block_shape, dtype=dtype) block_values.fill(fill_value) - return make_block(block_values, placement=placement, ndim=block_values.ndim) + return make_block(block_values, placement=placement) def take(self, indexer, axis: int = 1, verify: bool = True, convert: bool = True): """ @@ -1550,7 +1529,7 @@ def __init__( ) self.axes = [axis] - self.blocks = (block,) + self.blocks = tuple([block]) @classmethod def from_blocks( @@ -1661,9 +1640,7 @@ def create_block_manager_from_blocks(blocks, axes: List[Index]) -> BlockManager: # is basically "all items", but if there're many, don't bother # converting, it's an error anyway. 
blocks = [ - make_block( - values=blocks[0], placement=slice(0, len(axes[0])), ndim=2 - ) + make_block(values=blocks[0], placement=slice(0, len(axes[0]))) ] mgr = BlockManager(blocks, axes) @@ -1683,11 +1660,8 @@ def create_block_manager_from_arrays( assert isinstance(axes, list) assert all(isinstance(x, Index) for x in axes) - # ensure we dont have any PandasArrays when we call get_block_type - # Note: just calling extract_array breaks tests that patch PandasArray._typ. - arrays = [x if not isinstance(x, ABCPandasArray) else x.to_numpy() for x in arrays] try: - blocks = _form_blocks(arrays, names, axes) + blocks = form_blocks(arrays, names, axes) mgr = BlockManager(blocks, axes) mgr._consolidate_inplace() return mgr @@ -1719,7 +1693,7 @@ def construction_error(tot_items, block_shape, axes, e=None): # ----------------------------------------------------------------------- -def _form_blocks(arrays, names: Index, axes) -> List[Block]: +def form_blocks(arrays, names: Index, axes) -> List[Block]: # put "leftover" items in float bucket, where else? # generalize? items_dict: DefaultDict[str, List] = defaultdict(list) @@ -1766,7 +1740,7 @@ def _form_blocks(arrays, names: Index, axes) -> List[Block]: if len(items_dict["DatetimeTZBlock"]): dttz_blocks = [ - make_block(array, klass=DatetimeTZBlock, placement=i, ndim=2) + make_block(array, klass=DatetimeTZBlock, placement=i) for i, _, array in items_dict["DatetimeTZBlock"] ] blocks.extend(dttz_blocks) @@ -1781,14 +1755,15 @@ def _form_blocks(arrays, names: Index, axes) -> List[Block]: if len(items_dict["CategoricalBlock"]) > 0: cat_blocks = [ - make_block(array, klass=CategoricalBlock, placement=i, ndim=2) + make_block(array, klass=CategoricalBlock, placement=i) for i, _, array in items_dict["CategoricalBlock"] ] blocks.extend(cat_blocks) if len(items_dict["ExtensionBlock"]): + external_blocks = [ - make_block(array, klass=ExtensionBlock, placement=i, ndim=2) + make_block(array, klass=ExtensionBlock, placement=i) for i, _, array in items_dict["ExtensionBlock"] ] @@ -1796,7 +1771,7 @@ def _form_blocks(arrays, names: Index, axes) -> List[Block]: if len(items_dict["ObjectValuesExtensionBlock"]): external_blocks = [ - make_block(array, klass=ObjectValuesExtensionBlock, placement=i, ndim=2) + make_block(array, klass=ObjectValuesExtensionBlock, placement=i) for i, _, array in items_dict["ObjectValuesExtensionBlock"] ] @@ -1809,7 +1784,7 @@ def _form_blocks(arrays, names: Index, axes) -> List[Block]: block_values = np.empty(shape, dtype=object) block_values.fill(np.nan) - na_block = make_block(block_values, placement=extra_locs, ndim=2) + na_block = make_block(block_values, placement=extra_locs) blocks.append(na_block) return blocks @@ -1826,7 +1801,7 @@ def _simple_blockify(tuples, dtype) -> List[Block]: if dtype is not None and values.dtype != dtype: # pragma: no cover values = values.astype(dtype) - block = make_block(values, placement=placement, ndim=2) + block = make_block(values, placement=placement) return [block] @@ -1840,7 +1815,7 @@ def _multi_blockify(tuples, dtype=None): values, placement = _stack_arrays(list(tup_block), dtype) - block = make_block(values, placement=placement, ndim=2) + block = make_block(values, placement=placement) new_blocks.append(block) return new_blocks @@ -1931,7 +1906,7 @@ def _merge_blocks( new_values = new_values[argsort] new_mgr_locs = new_mgr_locs[argsort] - return [make_block(new_values, placement=new_mgr_locs, ndim=2)] + return [make_block(new_values, placement=new_mgr_locs)] # can't consolidate --> no merge return 
blocks diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 80c4cd5b44a92..d38974839394d 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1646,7 +1646,7 @@ def nanpercentile( interpolation=interpolation, ) - # Note: we have to do `astype` and not view because in general we + # Note: we have to do do `astype` and not view because in general we # have float result at this point, not i8 return result.astype(values.dtype) diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index c855687552e82..8142fc3e695a3 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -27,7 +27,7 @@ is_object_dtype, is_scalar, ) -from pandas.core.dtypes.generic import ABCExtensionArray, ABCIndexClass, ABCSeries +from pandas.core.dtypes.generic import ABCExtensionArray, ABCIndex, ABCSeries from pandas.core.dtypes.missing import isna, notna from pandas.core.ops import missing @@ -40,11 +40,13 @@ def comp_method_OBJECT_ARRAY(op, x, y): if isinstance(y, list): y = construct_1d_object_array_from_listlike(y) - if isinstance(y, (np.ndarray, ABCSeries, ABCIndexClass)): + if isinstance(y, (np.ndarray, ABCSeries, ABCIndex)): + # Note: these checks can be for ABCIndex and not ABCIndexClass + # because that is the only object-dtype class. if not is_object_dtype(y.dtype): y = y.astype(np.object_) - if isinstance(y, (ABCSeries, ABCIndexClass)): + if isinstance(y, (ABCSeries, ABCIndex)): y = y._values if x.shape != y.shape: diff --git a/pandas/core/resample.py b/pandas/core/resample.py index a2f25bbcf38d3..53e565a966769 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -950,7 +950,7 @@ def quantile(self, q=0.5, **kwargs): # downsample methods -for method in ["sum", "prod", "min", "max", "first", "last"]: +for method in ["sum", "prod"]: def f(self, _method=method, min_count=0, *args, **kwargs): nv.validate_resampler_func(_method, args, kwargs) @@ -961,7 +961,7 @@ def f(self, _method=method, min_count=0, *args, **kwargs): # downsample methods -for method in ["mean", "sem", "median", "ohlc"]: +for method in ["min", "max", "first", "last", "mean", "sem", "median", "ohlc"]: def g(self, _method=method, *args, **kwargs): nv.validate_resampler_func(_method, args, kwargs) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 4a2629daf63d7..77b1076920f20 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -3,27 +3,16 @@ """ from collections import abc -from typing import ( - TYPE_CHECKING, - Iterable, - List, - Mapping, - Optional, - Type, - Union, - cast, - overload, -) +from typing import TYPE_CHECKING, Iterable, List, Mapping, Type, Union, cast, overload import numpy as np -from pandas._typing import FrameOrSeriesUnion, Label +from pandas._typing import FrameOrSeries, FrameOrSeriesUnion, Label from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries from pandas.core.dtypes.missing import isna -import pandas.core.algorithms as algos from pandas.core.arrays.categorical import ( factorize_from_iterable, factorize_from_iterables, @@ -306,7 +295,7 @@ class _Concatenator: def __init__( self, - objs: Union[Iterable["NDFrame"], Mapping[Label, "NDFrame"]], + objs: Union[Iterable[FrameOrSeries], Mapping[Label, FrameOrSeries]], axis=0, join: str = "outer", keys=None, @@ -377,7 +366,7 @@ def __init__( # get the sample # want the highest ndim that we have, and must be non-empty # unless all objs are empty - sample: Optional["NDFrame"] = None + 
sample = None if len(ndims) > 1: max_ndim = max(ndims) for obj in objs: @@ -447,8 +436,6 @@ def __init__( # to line up if self._is_frame and axis == 1: name = 0 - # mypy needs to know sample is not an NDFrame - sample = cast("FrameOrSeriesUnion", sample) obj = sample._constructor({name: obj}) self.objs.append(obj) @@ -514,13 +501,6 @@ def get_result(self): # 1-ax to convert BlockManager axis to DataFrame axis obj_labels = obj.axes[1 - ax] if not new_labels.equals(obj_labels): - # We have to remove the duplicates from obj_labels - # in new labels to make them unique, otherwise we would - # duplicate or duplicates again - if not obj_labels.is_unique: - new_labels = algos.make_duplicates_of_left_unique_in_right( - np.asarray(obj_labels), np.asarray(new_labels) - ) indexers[ax] = obj_labels.reindex(new_labels)[1] mgrs_indexers.append((obj._mgr, indexers)) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 3b755c40721fb..dd45a00155721 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -5,9 +5,8 @@ import copy import datetime from functools import partial -import hashlib import string -from typing import TYPE_CHECKING, Optional, Tuple, cast +from typing import TYPE_CHECKING, Optional, Tuple import warnings import numpy as np @@ -51,7 +50,6 @@ if TYPE_CHECKING: from pandas import DataFrame - from pandas.core.arrays import DatetimeArray @Substitution("\nleft : DataFrame") @@ -644,17 +642,6 @@ def __init__( self._validate_specification() - cross_col = None - if self.how == "cross": - ( - self.left, - self.right, - self.how, - cross_col, - ) = self._create_cross_configuration(self.left, self.right) - self.left_on = self.right_on = [cross_col] - self._cross = cross_col - # note this function has side effects ( self.left_join_keys, @@ -702,14 +689,8 @@ def get_result(self): self._maybe_restore_index_levels(result) - self._maybe_drop_cross_column(result, self._cross) - return result.__finalize__(self, method="merge") - def _maybe_drop_cross_column(self, result: "DataFrame", cross_col: Optional[str]): - if cross_col is not None: - result.drop(columns=cross_col, inplace=True) - def _indicator_pre_merge( self, left: "DataFrame", right: "DataFrame" ) -> Tuple["DataFrame", "DataFrame"]: @@ -1218,50 +1199,9 @@ def _maybe_coerce_merge_keys(self): typ = rk.categories.dtype if rk_is_cat else object self.right = self.right.assign(**{name: self.right[name].astype(typ)}) - def _create_cross_configuration( - self, left, right - ) -> Tuple["DataFrame", "DataFrame", str, str]: - """ - Creates the configuration to dispatch the cross operation to inner join, - e.g. adding a join column and resetting parameters. Join column is added - to a new object, no inplace modification - - Parameters - ---------- - left: DataFrame - right DataFrame - - Returns - ------- - a tuple (left, right, how, cross_col) representing the adjusted - DataFrames with cross_col, the merge operation set to inner and the column - to join over. - """ - cross_col = f"_cross_{hashlib.md5().hexdigest()}" - how = "inner" - return ( - left.assign(**{cross_col: 1}), - right.assign(**{cross_col: 1}), - how, - cross_col, - ) - def _validate_specification(self): - if self.how == "cross": - if ( - self.left_index - or self.right_index - or self.right_on is not None - or self.left_on is not None - or self.on is not None - ): - raise MergeError( - "Can not pass on, right_on, left_on or set right_index=True or " - "left_index=True" - ) - return # Hm, any way to make this logic less complicated?? 
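# The key-specification cases _validate_specification sorts out, as a quick sketch:
import pandas as pd

left = pd.DataFrame({"k": [1, 2], "lv": ["a", "b"]})
right = pd.DataFrame({"k": [1, 2], "rv": ["x", "y"]})

pd.merge(left, right, on="k")                                         # shared column name
pd.merge(left, right, left_on="k", right_on="k")                      # per-side key columns
pd.merge(left, right.set_index("k"), left_on="k", right_index=True)   # column vs. index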
- elif self.on is None and self.left_on is None and self.right_on is None: + if self.on is None and self.left_on is None and self.right_on is None: if self.left_index and self.right_index: self.left_on, self.right_on = (), () @@ -1325,7 +1265,7 @@ def _validate_specification(self): 'of levels in the index of "left"' ) self.left_on = [None] * n - if self.how != "cross" and len(self.right_on) != len(self.left_on): + if len(self.right_on) != len(self.left_on): raise ValueError("len(right_on) must equal len(left_on)") def _validate(self, validate: str): @@ -1417,14 +1357,12 @@ def get_join_indexers( lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort, how=how) # preserve left frame order if how == 'left' and sort == False kwargs = copy.copy(kwargs) - if how in ("left", "right"): + if how == "left": kwargs["sort"] = sort join_func = { "inner": libjoin.inner_join, "left": libjoin.left_outer_join, - "right": lambda x, y, count, **kwargs: libjoin.left_outer_join( - y, x, count, **kwargs - )[::-1], + "right": _right_outer_join, "outer": libjoin.full_outer_join, }[how] @@ -1944,6 +1882,11 @@ def _left_join_on_index(left_ax: Index, right_ax: Index, join_keys, sort: bool = return left_ax, None, right_indexer +def _right_outer_join(x, y, max_groups): + right_indexer, left_indexer = libjoin.left_outer_join(y, x, max_groups) + return left_indexer, right_indexer + + def _factorize_keys( lk: ArrayLike, rk: ArrayLike, sort: bool = True, how: str = "inner" ) -> Tuple[np.ndarray, np.ndarray, int]: @@ -2004,8 +1947,8 @@ def _factorize_keys( if is_datetime64tz_dtype(lk.dtype) and is_datetime64tz_dtype(rk.dtype): # Extract the ndarray (UTC-localized) values # Note: we dont need the dtypes to match, as these can still be compared - lk = cast("DatetimeArray", lk)._ndarray - rk = cast("DatetimeArray", rk)._ndarray + lk, _ = lk._values_for_factorize() + rk, _ = rk._values_for_factorize() elif ( is_categorical_dtype(lk.dtype) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index c1198cdfcda81..8fae01cb30d3d 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -450,9 +450,10 @@ def pivot( cols = com.convert_to_list_like(index) else: cols = [] + cols.extend(columns) append = index is None - indexed = data.set_index(cols + columns, append=append) + indexed = data.set_index(cols, append=append) else: if index is None: index = [Series(data.index, name=data.index.name)] diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index c197e142fecbc..18ebe14763797 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -137,7 +137,7 @@ def _indexer_and_to_sort(self): @cache_readonly def sorted_labels(self): indexer, to_sort = self._indexer_and_to_sort - return [line.take(indexer) for line in to_sort] + return [l.take(indexer) for l in to_sort] def _make_sorted_values(self, values: np.ndarray) -> np.ndarray: indexer, _ = self._indexer_and_to_sort @@ -399,7 +399,6 @@ def _unstack_multiple(data, clocs, fill_value=None): def unstack(obj, level, fill_value=None): - if isinstance(level, (tuple, list)): if len(level) != 1: # _unstack_multiple only handles MultiIndexes, @@ -417,13 +416,6 @@ def unstack(obj, level, fill_value=None): return _unstack_frame(obj, level, fill_value=fill_value) else: return obj.T.stack(dropna=False) - elif not isinstance(obj.index, MultiIndex): - # GH 36113 - # Give nicer error messages when unstack a Series whose - # Index is not a MultiIndex. 
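# unstack in one line, as a sketch: a MultiIndex level is pivoted into the columns;
# a Series with a plain (non-MultiIndex) index has nothing to unstack.
import pandas as pd

df = pd.DataFrame({"A": ["x", "x", "y"], "B": [1, 2, 1], "C": [10, 20, 30]})
s = df.set_index(["A", "B"])["C"]
s.unstack("B")     # level B becomes the columns; the missing (y, 2) cell is NaN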
- raise ValueError( - f"index must be a MultiIndex to unstack, {type(obj.index)} was passed" - ) else: if is_extension_array_dtype(obj.dtype): return _unstack_extension_series(obj, level, fill_value) @@ -521,7 +513,7 @@ def factorize(index): verify_integrity=False, ) - if not frame.empty and frame._is_homogeneous_type: + if frame._is_homogeneous_type: # For homogeneous EAs, frame._values will coerce to object. So # we concatenate instead. dtypes = list(frame.dtypes._values) diff --git a/pandas/core/series.py b/pandas/core/series.py index d493ac0a8c051..f243771ff97a5 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -176,7 +176,6 @@ class Series(base.IndexOpsMixin, generic.NDFrame): """ _typ = "series" - _HANDLED_TYPES = (Index, ExtensionArray, np.ndarray) _name: Label _metadata: List[str] = ["name"] @@ -368,7 +367,7 @@ def _init_dict(self, data, index=None, dtype=None): values = na_value_for_dtype(dtype) keys = index else: - keys, values = tuple(), [] + keys, values = tuple([]), [] # Input is now list-like, so rely on "standard" construction: @@ -684,6 +683,81 @@ def view(self, dtype=None) -> "Series": # NDArray Compat _HANDLED_TYPES = (Index, ExtensionArray, np.ndarray) + def __array_ufunc__( + self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any + ): + # TODO: handle DataFrame + cls = type(self) + + # for binary ops, use our custom dunder methods + result = ops.maybe_dispatch_ufunc_to_dunder_op( + self, ufunc, method, *inputs, **kwargs + ) + if result is not NotImplemented: + return result + + # Determine if we should defer. + no_defer = (np.ndarray.__array_ufunc__, cls.__array_ufunc__) + + for item in inputs: + higher_priority = ( + hasattr(item, "__array_priority__") + and item.__array_priority__ > self.__array_priority__ + ) + has_array_ufunc = ( + hasattr(item, "__array_ufunc__") + and type(item).__array_ufunc__ not in no_defer + and not isinstance(item, self._HANDLED_TYPES) + ) + if higher_priority or has_array_ufunc: + return NotImplemented + + # align all the inputs. + names = [getattr(x, "name") for x in inputs if hasattr(x, "name")] + types = tuple(type(x) for x in inputs) + # TODO: dataframe + alignable = [x for x, t in zip(inputs, types) if issubclass(t, Series)] + + if len(alignable) > 1: + # This triggers alignment. + # At the moment, there aren't any ufuncs with more than two inputs + # so this ends up just being x1.index | x2.index, but we write + # it to handle *args. + index = alignable[0].index + for s in alignable[1:]: + index = index.union(s.index) + inputs = tuple( + x.reindex(index) if issubclass(t, Series) else x + for x, t in zip(inputs, types) + ) + else: + index = self.index + + inputs = tuple(extract_array(x, extract_numpy=True) for x in inputs) + result = getattr(ufunc, method)(*inputs, **kwargs) + + name = names[0] if len(set(names)) == 1 else None + + def construct_return(result): + if lib.is_scalar(result): + return result + elif result.ndim > 1: + # e.g. np.subtract.outer + if method == "outer": + # GH#27198 + raise NotImplementedError + return result + return self._constructor(result, index=index, name=name, copy=False) + + if type(result) is tuple: + # multiple return values + return tuple(construct_return(x) for x in result) + elif method == "at": + # no return value + return None + else: + return construct_return(result) + def __array__(self, dtype=None) -> np.ndarray: """ Return the values as a NumPy array. 
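# What the __array_ufunc__ definition above provides at the user level: Series inputs
# to a NumPy ufunc are aligned on the union of their indexes before the ufunc runs.
import numpy as np
import pandas as pd

s1 = pd.Series([1.0, 2.0], index=["a", "b"])
s2 = pd.Series([10.0, 20.0], index=["b", "c"])
np.add(s1, s2)     # a: NaN, b: 12.0, c: NaN, matching the alignment of s1 + s2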
@@ -826,7 +900,7 @@ def __getitem__(self, key): return result - except (KeyError, TypeError): + except KeyError: if isinstance(key, tuple) and isinstance(self.index, MultiIndex): # We still have the corner case where a tuple is a key # in the first level of our MultiIndex @@ -890,7 +964,7 @@ def _get_values_tuple(self, key): return result if not isinstance(self.index, MultiIndex): - raise KeyError("key of type tuple not found and not a MultiIndex") + raise ValueError("key of type tuple not found and not a MultiIndex") # If key is contained, would have returned by now indexer, new_index = self.index.get_loc_level(key) @@ -941,12 +1015,12 @@ def __setitem__(self, key, value): # positional setter values[key] = value else: - # GH#12862 adding a new key to the Series + # GH#12862 adding an new key to the Series self.loc[key] = value except TypeError as err: if isinstance(key, tuple) and not isinstance(self.index, MultiIndex): - raise KeyError( + raise ValueError( "key of type tuple not found and not a MultiIndex" ) from err @@ -1354,7 +1428,6 @@ def to_string( @doc( klass=_shared_doc_kwargs["klass"], - storage_options=generic._shared_docs["storage_options"], examples=dedent( """ Examples @@ -1393,7 +1466,14 @@ def to_markdown( Add index (row) labels. .. versionadded:: 1.1.0 - {storage_options} + + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values. .. versionadded:: 1.2.0 @@ -4617,7 +4697,7 @@ def isin(self, values) -> "Series": 5 False Name: animal, dtype: bool """ - result = algorithms.isin(self._values, values) + result = algorithms.isin(self, values) return self._constructor(result, index=self.index).__finalize__( self, method="isin" ) diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index 9de9d1f434a12..cc918c27b5c2e 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -324,67 +324,4 @@ 0 0.000000 1.000000 1 1.000000 2.718282 2 1.414214 7.389056 - -You can call transform on a GroupBy object: - ->>> df = pd.DataFrame({{ -... "Date": [ -... "2015-05-08", "2015-05-07", "2015-05-06", "2015-05-05", -... "2015-05-08", "2015-05-07", "2015-05-06", "2015-05-05"], -... "Data": [5, 8, 6, 1, 50, 100, 60, 120], -... }}) ->>> df - Date Data -0 2015-05-08 5 -1 2015-05-07 8 -2 2015-05-06 6 -3 2015-05-05 1 -4 2015-05-08 50 -5 2015-05-07 100 -6 2015-05-06 60 -7 2015-05-05 120 ->>> df.groupby('Date')['Data'].transform('sum') -0 55 -1 108 -2 66 -3 121 -4 55 -5 108 -6 66 -7 121 -Name: Data, dtype: int64 - ->>> df = pd.DataFrame({{ -... "c": [1, 1, 1, 2, 2, 2, 2], -... "type": ["m", "n", "o", "m", "m", "n", "n"] -... }}) ->>> df - c type -0 1 m -1 1 n -2 1 o -3 2 m -4 2 m -5 2 n -6 2 n ->>> df['size'] = df.groupby('c')['type'].transform(len) ->>> df - c type size -0 1 m 3 -1 1 n 3 -2 1 o 3 -3 2 m 4 -4 2 m 4 -5 2 n 4 -6 2 n 4 """ - -_shared_docs[ - "storage_options" -] = """storage_options : dict, optional - Extra options that make sense for a particular storage connection, e.g. - host, port, username, password, etc., if using a URL that will - be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error - will be raised if providing this argument with a non-fsspec URL. 
- See the fsspec and backend storage implementation docs for the set of - allowed keys and values.""" diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 729f517c789a7..e390229b5dcba 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -31,7 +31,6 @@ if TYPE_CHECKING: from pandas import MultiIndex - from pandas.core.arrays import ExtensionArray from pandas.core.indexes.base import Index _INT64_MAX = np.iinfo(np.int64).max @@ -391,7 +390,7 @@ def nargsort( return indexer -def nargminmax(values: "ExtensionArray", method: str) -> int: +def nargminmax(values, method: str): """ Implementation of np.argmin/argmax but for ExtensionArray and which handles missing values. @@ -406,20 +405,16 @@ def nargminmax(values: "ExtensionArray", method: str) -> int: int """ assert method in {"argmax", "argmin"} + func = np.argmax if method == "argmax" else np.argmin - mask = np.asarray(values.isna()) - if mask.all(): - # Use same exception message we would get from numpy - raise ValueError(f"attempt to get {method} of an empty sequence") + mask = np.asarray(isna(values)) + values = values._values_for_argsort() - if method == "argmax": - # Use argsort with ascending=False so that if more than one entry - # achieves the maximum, we take the first such occurence. - sorters = values.argsort(ascending=False) - else: - sorters = values.argsort(ascending=True) + idx = np.arange(len(values)) + non_nans = values[~mask] + non_nan_idx = idx[~mask] - return sorters[0] + return non_nan_idx[func(non_nans)] def _ensure_key_mapped_multiindex( @@ -610,7 +605,7 @@ def compress_group_index(group_index, sort: bool = True): if sort and len(obs_group_ids) > 0: obs_group_ids, comp_ids = _reorder_by_uniques(obs_group_ids, comp_ids) - return ensure_int64(comp_ids), ensure_int64(obs_group_ids) + return comp_ids, obs_group_ids def _reorder_by_uniques(uniques, labels): diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 9d16beba669ca..7d6a2bf1d776d 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -157,10 +157,11 @@ def __init__(self, data): array = data.array self._array = array - self._index = self._name = None if isinstance(data, ABCSeries): self._index = data.index self._name = data.name + else: + self._index = self._name = None # ._values.categories works for both Series/Index self._parent = data._values.categories if self._is_categorical else data diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index 4af32b219d380..32ca83787c4c1 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -10,7 +10,6 @@ is_number, is_numeric_dtype, is_scalar, - needs_i8_conversion, ) from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries @@ -124,9 +123,8 @@ def to_numeric(arg, errors="raise", downcast=None): values = arg.values elif isinstance(arg, ABCIndexClass): is_index = True - if needs_i8_conversion(arg.dtype): - values = arg.asi8 - else: + values = arg.asi8 + if values is None: values = arg.values elif isinstance(arg, (list, tuple)): values = np.array(arg, dtype="O") diff --git a/pandas/core/tools/timedeltas.py b/pandas/core/tools/timedeltas.py index 6a9fd7a542a44..e8faebd6b2542 100644 --- a/pandas/core/tools/timedeltas.py +++ b/pandas/core/tools/timedeltas.py @@ -66,11 +66,6 @@ def to_timedelta(arg, unit=None, errors="raise"): to_datetime : Convert argument to datetime. convert_dtypes : Convert dtypes. 
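The ``nargminmax`` rewrite in the sorting hunk above amounts to masking out missing values and mapping the reduced position back to the original coordinates; a plain-NumPy sketch of that idea with arbitrary values:

>>> import numpy as np
>>> values = np.array([np.nan, 3.0, 1.0, np.nan, 5.0])
>>> mask = np.isnan(values)
>>> idx = np.arange(len(values))
>>> idx[~mask][np.argmax(values[~mask])]  # position of the max, ignoring NaN
4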
- Notes - ----- - If the precision is higher than nanoseconds, the precision of the duration is - truncated to nanoseconds for string inputs. - Examples -------- Parsing a single string to a Timedelta: diff --git a/pandas/core/window/__init__.py b/pandas/core/window/__init__.py index b3d0820fee4da..304c61ac0e489 100644 --- a/pandas/core/window/__init__.py +++ b/pandas/core/window/__init__.py @@ -1,6 +1,3 @@ -from pandas.core.window.ewm import ( # noqa:F401 - ExponentialMovingWindow, - ExponentialMovingWindowGroupby, -) +from pandas.core.window.ewm import ExponentialMovingWindow # noqa:F401 from pandas.core.window.expanding import Expanding, ExpandingGroupby # noqa:F401 from pandas.core.window.rolling import Rolling, RollingGroupby, Window # noqa:F401 diff --git a/pandas/core/window/common.py b/pandas/core/window/common.py index 6ebf610587d30..938f1846230cb 100644 --- a/pandas/core/window/common.py +++ b/pandas/core/window/common.py @@ -1,6 +1,5 @@ """Common utility functions for rolling operations""" from collections import defaultdict -from typing import cast import warnings import numpy as np @@ -110,9 +109,6 @@ def dataframe_from_int_dict(data, frame_template): # set the index and reorder if arg2.columns.nlevels > 1: - # mypy needs to know columns is a MultiIndex, Index doesn't - # have levels attribute - arg2.columns = cast(MultiIndex, arg2.columns) result.index = MultiIndex.from_product( arg2.columns.levels + [result_index] ) diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index f8237a436f436..9f7040943d9a3 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -14,20 +14,8 @@ from pandas.core.dtypes.common import is_datetime64_ns_dtype import pandas.core.common as common -from pandas.core.util.numba_ import maybe_use_numba -from pandas.core.window.common import ( - _doc_template, - _shared_docs, - flex_binary_moment, - zsqrt, -) -from pandas.core.window.indexers import ( - BaseIndexer, - ExponentialMovingWindowIndexer, - GroupbyIndexer, -) -from pandas.core.window.numba_ import generate_numba_groupby_ewma_func -from pandas.core.window.rolling import BaseWindow, BaseWindowGroupby, dispatch +from pandas.core.window.common import _doc_template, _shared_docs, zsqrt +from pandas.core.window.rolling import BaseWindow, flex_binary_moment if TYPE_CHECKING: from pandas import Series @@ -184,7 +172,7 @@ class ExponentialMovingWindow(BaseWindow): ----- More details can be found at: - :ref:`Exponentially weighted windows `. + :ref:`Exponentially weighted windows `. 
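``get_center_of_mass`` in the constructor below folds ``span`` and ``alpha`` into a single center-of-mass value; a worked check of the usual relations (``com = (span - 1) / 2`` and ``alpha = 1 / (1 + com)``), assuming those formulas:

>>> span = 5
>>> com = (span - 1) / 2   # 2.0
>>> alpha = 1 / (1 + com)  # 0.333...
>>> alpha == 2 / (span + 1)
True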
Examples -------- @@ -231,16 +219,14 @@ def __init__( ignore_na: bool = False, axis: int = 0, times: Optional[Union[str, np.ndarray, FrameOrSeries]] = None, - **kwargs, ): + self.com: Optional[float] self.obj = obj self.min_periods = max(int(min_periods), 1) self.adjust = adjust self.ignore_na = ignore_na self.axis = axis self.on = None - self.center = False - self.closed = None if times is not None: if isinstance(times, str): times = self._selected_obj[times] @@ -259,7 +245,7 @@ def __init__( if common.count_not_none(com, span, alpha) > 0: self.com = get_center_of_mass(com, span, None, alpha) else: - self.com = 0.0 + self.com = None else: if halflife is not None and isinstance(halflife, (str, datetime.timedelta)): raise ValueError( @@ -274,12 +260,6 @@ def __init__( def _constructor(self): return ExponentialMovingWindow - def _get_window_indexer(self) -> BaseIndexer: - """ - Return an indexer class that will compute the window start and end bounds - """ - return ExponentialMovingWindowIndexer() - _agg_see_also_doc = dedent( """ See Also @@ -319,6 +299,27 @@ def aggregate(self, func, *args, **kwargs): agg = aggregate + def _apply(self, func): + """ + Rolling statistical measure using supplied function. Designed to be + used with passed-in Cython array-based functions. + + Parameters + ---------- + func : str/callable to apply + + Returns + ------- + y : same type as input argument + """ + + def homogeneous_func(values: np.ndarray): + if values.size == 0: + return values.copy() + return np.apply_along_axis(func, self.axis, values) + + return self._apply_blockwise(homogeneous_func) + @Substitution(name="ewm", func_name="mean") @Appender(_doc_template) def mean(self, *args, **kwargs): @@ -335,6 +336,7 @@ def mean(self, *args, **kwargs): window_func = self._get_roll_func("ewma_time") window_func = partial( window_func, + minp=self.min_periods, times=self.times, halflife=self.halflife, ) @@ -345,6 +347,7 @@ def mean(self, *args, **kwargs): com=self.com, adjust=self.adjust, ignore_na=self.ignore_na, + minp=self.min_periods, ) return self._apply(window_func) @@ -368,19 +371,13 @@ def var(self, bias: bool = False, *args, **kwargs): Exponential weighted moving variance. 
""" nv.validate_window_func("var", args, kwargs) - window_func = self._get_roll_func("ewmcov") - window_func = partial( - window_func, - com=self.com, - adjust=self.adjust, - ignore_na=self.ignore_na, - bias=bias, - ) - def var_func(values, begin, end, min_periods): - return window_func(values, begin, end, min_periods, values) + def f(arg): + return window_aggregations.ewmcov( + arg, arg, self.com, self.adjust, self.ignore_na, self.min_periods, bias + ) - return self._apply(var_func) + return self._apply(f) @Substitution(name="ewm", func_name="cov") @Appender(_doc_template) @@ -422,13 +419,11 @@ def _get_cov(X, Y): Y = self._shallow_copy(Y) cov = window_aggregations.ewmcov( X._prep_values(), - np.array([0], dtype=np.int64), - np.array([0], dtype=np.int64), - self.min_periods, Y._prep_values(), self.com, self.adjust, self.ignore_na, + self.min_periods, bias, ) return wrap_result(X, cov) @@ -475,15 +470,7 @@ def _get_corr(X, Y): def _cov(x, y): return window_aggregations.ewmcov( - x, - np.array([0], dtype=np.int64), - np.array([0], dtype=np.int64), - self.min_periods, - y, - self.com, - self.adjust, - self.ignore_na, - 1, + x, y, self.com, self.adjust, self.ignore_na, self.min_periods, 1 ) x_values = X._prep_values() @@ -498,78 +485,3 @@ def _cov(x, y): return flex_binary_moment( self._selected_obj, other._selected_obj, _get_corr, pairwise=bool(pairwise) ) - - -class ExponentialMovingWindowGroupby(BaseWindowGroupby, ExponentialMovingWindow): - """ - Provide an exponential moving window groupby implementation. - """ - - def _get_window_indexer(self) -> GroupbyIndexer: - """ - Return an indexer class that will compute the window start and end bounds - - Returns - ------- - GroupbyIndexer - """ - window_indexer = GroupbyIndexer( - groupby_indicies=self._groupby.indices, - window_indexer=ExponentialMovingWindowIndexer, - ) - return window_indexer - - var = dispatch("var", bias=False) - std = dispatch("std", bias=False) - cov = dispatch("cov", other=None, pairwise=None, bias=False) - corr = dispatch("corr", other=None, pairwise=None) - - def mean(self, engine=None, engine_kwargs=None): - """ - Parameters - ---------- - engine : str, default None - * ``'cython'`` : Runs mean through C-extensions from cython. - * ``'numba'`` : Runs mean through JIT compiled code from numba. - Only available when ``raw`` is set to ``True``. - * ``None`` : Defaults to ``'cython'`` or globally setting - ``compute.use_numba`` - - .. versionadded:: 1.2.0 - - engine_kwargs : dict, default None - * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` - * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` - and ``parallel`` dictionary keys. The values must either be ``True`` or - ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is - ``{'nopython': True, 'nogil': False, 'parallel': False}``. - - .. versionadded:: 1.2.0 - - Returns - ------- - Series or DataFrame - Return type is determined by the caller. 
- """ - if maybe_use_numba(engine): - groupby_ewma_func = generate_numba_groupby_ewma_func( - engine_kwargs, - self.com, - self.adjust, - self.ignore_na, - ) - return self._apply( - groupby_ewma_func, - numba_cache_key=(lambda x: x, "groupby_ewma"), - ) - elif engine in ("cython", None): - if engine_kwargs is not None: - raise ValueError("cython engine does not accept engine_kwargs") - - def f(x): - x = self._shallow_copy(x, groupby=self._groupby) - return x.mean() - - return self._groupby.apply(f) - else: - raise ValueError("engine must be either 'numba' or 'cython'") diff --git a/pandas/core/window/indexers.py b/pandas/core/window/indexers.py index a3b9695d777d9..a8229257bb7bb 100644 --- a/pandas/core/window/indexers.py +++ b/pandas/core/window/indexers.py @@ -344,18 +344,3 @@ def get_window_bounds( start = np.concatenate(start_arrays) end = np.concatenate(end_arrays) return start, end - - -class ExponentialMovingWindowIndexer(BaseIndexer): - """Calculate ewm window bounds (the entire window)""" - - @Appender(get_window_bounds_doc) - def get_window_bounds( - self, - num_values: int = 0, - min_periods: Optional[int] = None, - center: Optional[bool] = None, - closed: Optional[str] = None, - ) -> Tuple[np.ndarray, np.ndarray]: - - return np.array([0], dtype=np.int64), np.array([num_values], dtype=np.int64) diff --git a/pandas/core/window/numba_.py b/pandas/core/window/numba_.py index 274586e1745b5..c4858b6e5a4ab 100644 --- a/pandas/core/window/numba_.py +++ b/pandas/core/window/numba_.py @@ -72,92 +72,3 @@ def roll_apply( return result return roll_apply - - -def generate_numba_groupby_ewma_func( - engine_kwargs: Optional[Dict[str, bool]], - com: float, - adjust: bool, - ignore_na: bool, -): - """ - Generate a numba jitted groupby ewma function specified by values - from engine_kwargs. 
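A usage sketch of the grouped EWM call path these hunks route through ``self._groupby.apply`` (the frame is made up; requires a pandas build that provides grouped ``.ewm()``):

>>> df = pd.DataFrame({"key": ["a", "a", "b", "b"], "x": [1.0, 2.0, 3.0, 4.0]})
>>> df.groupby("key")["x"].ewm(com=1.0).mean()  # doctest: +SKIP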
- - Parameters - ---------- - engine_kwargs : dict - dictionary of arguments to be passed into numba.jit - com : float - adjust : bool - ignore_na : bool - - Returns - ------- - Numba function - """ - nopython, nogil, parallel = get_jit_arguments(engine_kwargs) - - cache_key = (lambda x: x, "groupby_ewma") - if cache_key in NUMBA_FUNC_CACHE: - return NUMBA_FUNC_CACHE[cache_key] - - numba = import_optional_dependency("numba") - if parallel: - loop_range = numba.prange - else: - loop_range = range - - @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) - def groupby_ewma( - values: np.ndarray, - begin: np.ndarray, - end: np.ndarray, - minimum_periods: int, - ) -> np.ndarray: - result = np.empty(len(values)) - alpha = 1.0 / (1.0 + com) - for i in loop_range(len(begin)): - start = begin[i] - stop = end[i] - window = values[start:stop] - sub_result = np.empty(len(window)) - - old_wt_factor = 1.0 - alpha - new_wt = 1.0 if adjust else alpha - - weighted_avg = window[0] - nobs = int(not np.isnan(weighted_avg)) - sub_result[0] = weighted_avg if nobs >= minimum_periods else np.nan - old_wt = 1.0 - - for j in range(1, len(window)): - cur = window[j] - is_observation = not np.isnan(cur) - nobs += is_observation - if not np.isnan(weighted_avg): - - if is_observation or not ignore_na: - - old_wt *= old_wt_factor - if is_observation: - - # avoid numerical errors on constant series - if weighted_avg != cur: - weighted_avg = ( - (old_wt * weighted_avg) + (new_wt * cur) - ) / (old_wt + new_wt) - if adjust: - old_wt += new_wt - else: - old_wt = 1.0 - elif is_observation: - weighted_avg = cur - - sub_result[j] = weighted_avg if nobs >= minimum_periods else np.nan - - result[start:stop] = sub_result - - return result - - return groupby_ewma diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 51a1e2102c273..5d561c84ab462 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -337,13 +337,6 @@ def _get_roll_func(self, func_name: str) -> Callable[..., Any]: ) return window_func - @property - def _index_array(self): - # TODO: why do we get here with e.g. MultiIndex? - if needs_i8_conversion(self._on.dtype): - return self._on.asi8 - return None - def _get_window_indexer(self) -> BaseIndexer: """ Return an indexer class that will compute the window start and end bounds @@ -352,7 +345,7 @@ def _get_window_indexer(self) -> BaseIndexer: return self.window if self.is_freq_type: return VariableWindowIndexer( - index_array=self._index_array, window_size=self.window + index_array=self._on.asi8, window_size=self.window ) return FixedWindowIndexer(window_size=self.window) @@ -412,7 +405,7 @@ def _apply( self, func: Callable[..., Any], name: Optional[str] = None, - numba_cache_key: Optional[Tuple[Callable, str]] = None, + use_numba_cache: bool = False, **kwargs, ): """ @@ -424,8 +417,9 @@ def _apply( ---------- func : callable function to apply name : str, - numba_cache_key : tuple - caching key to be used to store a compiled numba func + use_numba_cache : bool + whether to cache a numba compiled function. 
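The ``groupby_ewma`` kernel above follows the standard adjusted EWMA recursion; a small worked check of the same formula on a plain Series (arbitrary values):

>>> s = pd.Series([1.0, 2.0, 3.0])
>>> s.ewm(alpha=0.5, adjust=True).mean()
0    1.000000
1    1.666667
2    2.428571
dtype: float64
>>> round((0.25 * 1.0 + 0.5 * 2.0 + 1.0 * 3.0) / (0.25 + 0.5 + 1.0), 6)  # last value by hand
2.428571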
Only available for numba + enabled methods (so far only apply) **kwargs additional arguments for rolling function and window function @@ -462,8 +456,8 @@ def calc(x): result = calc(values) result = np.asarray(result) - if numba_cache_key is not None: - NUMBA_FUNC_CACHE[numba_cache_key] = func + if use_numba_cache: + NUMBA_FUNC_CACHE[(kwargs["original_func"], "rolling_apply")] = func return result @@ -721,7 +715,7 @@ def aggregate(self, func, *args, **kwargs): ) -def dispatch(name: str, *args, **kwargs): +def _dispatch(name: str, *args, **kwargs): """ Dispatch to groupby apply. """ @@ -752,20 +746,20 @@ def __init__(self, obj, *args, **kwargs): self._groupby.grouper.mutated = True super().__init__(obj, *args, **kwargs) - corr = dispatch("corr", other=None, pairwise=None) - cov = dispatch("cov", other=None, pairwise=None) + corr = _dispatch("corr", other=None, pairwise=None) + cov = _dispatch("cov", other=None, pairwise=None) def _apply( self, func: Callable[..., Any], name: Optional[str] = None, - numba_cache_key: Optional[Tuple[Callable, str]] = None, + use_numba_cache: bool = False, **kwargs, ) -> FrameOrSeries: result = super()._apply( func, name, - numba_cache_key, + use_numba_cache, **kwargs, ) # Reconstruct the resulting MultiIndex from tuples @@ -1044,7 +1038,7 @@ def _apply( self, func: Callable[[np.ndarray, int, int], np.ndarray], name: Optional[str] = None, - numba_cache_key: Optional[Tuple[Callable, str]] = None, + use_numba_cache: bool = False, **kwargs, ): """ @@ -1056,8 +1050,9 @@ def _apply( ---------- func : callable function to apply name : str, - use_numba_cache : tuple - unused + use_numba_cache : bool + whether to cache a numba compiled function. Only available for numba + enabled methods (so far only apply) **kwargs additional arguments for scipy windows if necessary @@ -1275,7 +1270,7 @@ def count(self): Notes ----- - See :ref:`window.numba_engine` for extended documentation and performance + See :ref:`stats.rolling_apply` for extended documentation and performance considerations for the Numba engine. 
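The caching changes in the rolling hunks here sit behind the ``engine`` argument of ``Rolling.apply``; a usage sketch (the numba path needs the optional numba dependency):

>>> import numpy as np
>>> s = pd.Series(range(6), dtype="float64")
>>> s.rolling(3).apply(np.mean, raw=True)  # default cython engine
0    NaN
1    NaN
2    1.0
3    2.0
4    3.0
5    4.0
dtype: float64
>>> s.rolling(3).apply(np.mean, raw=True, engine="numba")  # doctest: +SKIP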
""" ) @@ -1297,12 +1292,10 @@ def apply( if not is_bool(raw): raise ValueError("raw parameter must be `True` or `False`") - numba_cache_key = None if maybe_use_numba(engine): if raw is False: raise ValueError("raw must be `True` when using the numba engine") apply_func = generate_numba_apply_func(args, kwargs, func, engine_kwargs) - numba_cache_key = (func, "rolling_apply") elif engine in ("cython", None): if engine_kwargs is not None: raise ValueError("cython engine does not accept engine_kwargs") @@ -1312,7 +1305,10 @@ def apply( return self._apply( apply_func, - numba_cache_key=numba_cache_key, + use_numba_cache=maybe_use_numba(engine), + original_func=func, + args=args, + kwargs=kwargs, ) def _generate_cython_apply_func( @@ -2147,7 +2143,7 @@ def _get_window_indexer(self) -> GroupbyIndexer: """ rolling_indexer: Type[BaseIndexer] indexer_kwargs: Optional[Dict[str, Any]] = None - index_array = self._index_array + index_array = self._on.asi8 window = self.window if isinstance(self.window, BaseIndexer): rolling_indexer = type(self.window) diff --git a/pandas/io/common.py b/pandas/io/common.py index 8ec0a869c7042..910eb23d9a2d0 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -8,7 +8,21 @@ import mmap import os import pathlib -from typing import IO, Any, AnyStr, Dict, List, Mapping, Optional, Tuple, cast +from typing import ( + IO, + TYPE_CHECKING, + Any, + AnyStr, + Dict, + Generic, + List, + Mapping, + Optional, + Tuple, + Type, + Union, + cast, +) from urllib.parse import ( urljoin, urlparse as parse_url, @@ -23,8 +37,10 @@ Buffer, CompressionDict, CompressionOptions, + EncodingVar, FileOrBuffer, FilePathOrBuffer, + ModeVar, StorageOptions, ) from pandas.compat import get_lzma_file, import_lzma @@ -39,10 +55,16 @@ _VALID_URLS.discard("") +if TYPE_CHECKING: + from io import IOBase + + @dataclasses.dataclass -class IOArgs: +class IOArgs(Generic[ModeVar, EncodingVar]): """ - Return value of io/common.py:_get_filepath_or_buffer. + Return value of io/common.py:get_filepath_or_buffer. + + This is used to easily close created fsspec objects. Note (copy&past from io/parsers): filepath_or_buffer can be Union[FilePathOrBuffer, s3fs.S3File, gcsfs.GCSFile] @@ -51,19 +73,29 @@ class IOArgs: """ filepath_or_buffer: FileOrBuffer - encoding: str - mode: str + encoding: EncodingVar + mode: Union[ModeVar, str] compression: CompressionDict should_close: bool = False + def close(self) -> None: + """ + Close the buffer if it was created by get_filepath_or_buffer. + """ + if self.should_close: + assert not isinstance(self.filepath_or_buffer, str) + try: + self.filepath_or_buffer.close() + except (OSError, ValueError): + pass + self.should_close = False + @dataclasses.dataclass class IOHandles: """ Return value of io/common.py:get_handle - Can be used as a context manager. - This is used to easily close created buffers and to handle corner cases when TextIOWrapper is inserted. 
@@ -73,7 +105,6 @@ class IOHandles: """ handle: Buffer - compression: CompressionDict created_handles: List[Buffer] = dataclasses.field(default_factory=list) is_wrapped: bool = False is_mmap: bool = False @@ -98,12 +129,6 @@ def close(self) -> None: self.created_handles = [] self.is_wrapped = False - def __enter__(self) -> "IOHandles": - return self - - def __exit__(self, *args: Any) -> None: - self.close() - def is_url(url) -> bool: """ @@ -214,13 +239,18 @@ def is_fsspec_url(url: FilePathOrBuffer) -> bool: ) -def _get_filepath_or_buffer( +# https://github.com/python/mypy/issues/8708 +# error: Incompatible default for argument "encoding" (default has type "None", +# argument has type "str") +# error: Incompatible default for argument "mode" (default has type "None", +# argument has type "str") +def get_filepath_or_buffer( filepath_or_buffer: FilePathOrBuffer, - encoding: str = "utf-8", + encoding: EncodingVar = None, # type: ignore[assignment] compression: CompressionOptions = None, - mode: str = "r", + mode: ModeVar = None, # type: ignore[assignment] storage_options: StorageOptions = None, -) -> IOArgs: +) -> IOArgs[ModeVar, EncodingVar]: """ If the filepath_or_buffer is a url, translate and return the buffer. Otherwise passthrough. @@ -254,7 +284,12 @@ def _get_filepath_or_buffer( compression_method = infer_compression(filepath_or_buffer, compression_method) # GH21227 internal compression is not used for non-binary handles. - if compression_method and hasattr(filepath_or_buffer, "write") and "b" not in mode: + if ( + compression_method + and hasattr(filepath_or_buffer, "write") + and mode + and "b" not in mode + ): warnings.warn( "compression has no effect when passing a non-binary object as input.", RuntimeWarning, @@ -271,7 +306,8 @@ def _get_filepath_or_buffer( # bz2 and xz do not write the byte order mark for utf-16 and utf-32 # print a warning when writing such files if ( - "w" in mode + mode + and "w" in mode and compression_method in ["bz2", "xz"] and encoding in ["utf-16", "utf-32"] ): @@ -283,7 +319,7 @@ def _get_filepath_or_buffer( # Use binary mode when converting path-like objects to file-like objects (fsspec) # except when text mode is explicitly requested. The original mode is returned if # fsspec is not used. - fsspec_mode = mode + fsspec_mode = mode or "rb" if "t" not in fsspec_mode and "b" not in fsspec_mode: fsspec_mode += "b" @@ -468,6 +504,7 @@ def infer_compression( ------ ValueError on invalid compression specified. """ + # No compression has been explicitly specified if compression is None: return None @@ -503,7 +540,6 @@ def get_handle( memory_map: bool = False, is_text: bool = True, errors: Optional[str] = None, - storage_options: StorageOptions = None, ) -> IOHandles: """ Get file handle for given path/buffer and mode. @@ -547,73 +583,66 @@ def get_handle( Specifies how encoding and decoding errors are to be handled. See the errors argument for :func:`open` for a full list of options. - storage_options: StorageOptions = None - Passed to _get_filepath_or_buffer .. versionchanged:: 1.2.0 Returns the dataclass IOHandles """ + need_text_wrapping: Tuple[Type["IOBase"], ...] + try: + from s3fs import S3File + + need_text_wrapping = (BufferedIOBase, RawIOBase, S3File) + except ImportError: + need_text_wrapping = (BufferedIOBase, RawIOBase) + # fsspec is an optional dependency. If it is available, add its file-object + # class to the list of classes that need text wrapping. 
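``infer_compression`` above maps a filename extension to a compression method when ``'infer'`` is requested; a short sketch of the assumed behaviour:

>>> from pandas.io.common import infer_compression
>>> infer_compression("data.csv.gz", "infer")
'gzip'
>>> infer_compression("data.csv", "infer") is None
True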
If fsspec is too old and is + # needed, get_filepath_or_buffer would already have thrown an exception. + try: + from fsspec.spec import AbstractFileSystem + + need_text_wrapping = (*need_text_wrapping, AbstractFileSystem) + except ImportError: + pass + # Windows does not default to utf-8. Set to utf-8 for a consistent behavior if encoding is None: encoding = "utf-8" - # read_csv does not know whether the buffer is opened in binary/text mode - if _is_binary_mode(path_or_buf, mode) and "b" not in mode: - mode += "b" - - # open URLs - ioargs = _get_filepath_or_buffer( - path_or_buf, - encoding=encoding, - compression=compression, - mode=mode, - storage_options=storage_options, - ) + # Convert pathlib.Path/py.path.local or string + handle = stringify_path(path_or_buf) - handle = ioargs.filepath_or_buffer - handles: List[Buffer] + compression, compression_args = get_compression_method(compression) + compression = infer_compression(handle, compression) # memory mapping needs to be the first step handle, memory_map, handles = _maybe_memory_map( - handle, memory_map, ioargs.encoding, ioargs.mode, errors + handle, memory_map, encoding, mode, errors ) is_path = isinstance(handle, str) - compression_args = dict(ioargs.compression) - compression = compression_args.pop("method") - if compression: - # compression libraries do not like an explicit text-mode - ioargs.mode = ioargs.mode.replace("t", "") - # GZ Compression if compression == "gzip": if is_path: assert isinstance(handle, str) - handle = gzip.GzipFile( - filename=handle, - mode=ioargs.mode, - **compression_args, - ) + handle = gzip.GzipFile(filename=handle, mode=mode, **compression_args) else: handle = gzip.GzipFile( fileobj=handle, # type: ignore[arg-type] - mode=ioargs.mode, + mode=mode, **compression_args, ) # BZ Compression elif compression == "bz2": handle = bz2.BZ2File( - handle, # type: ignore[arg-type] - mode=ioargs.mode, - **compression_args, + handle, mode=mode, **compression_args # type: ignore[arg-type] ) # ZIP Compression elif compression == "zip": - handle = _BytesZipFile(handle, ioargs.mode, **compression_args) + handle = _BytesZipFile(handle, mode, **compression_args) if handle.mode == "r": handles.append(handle) zip_names = handle.namelist() @@ -629,7 +658,7 @@ def get_handle( # XZ Compression elif compression == "xz": - handle = get_lzma_file(lzma)(handle, ioargs.mode) + handle = get_lzma_file(lzma)(handle, mode) # Unrecognized Compression else: @@ -639,50 +668,42 @@ def get_handle( assert not isinstance(handle, str) handles.append(handle) - elif isinstance(handle, str): + elif is_path: # Check whether the filename is to be opened in binary mode. # Binary mode does not support 'encoding' and 'newline'. 
- if ioargs.encoding and "b" not in ioargs.mode: + assert isinstance(handle, str) + if encoding and "b" not in mode: # Encoding - handle = open( - handle, - ioargs.mode, - encoding=ioargs.encoding, - errors=errors, - newline="", - ) + handle = open(handle, mode, encoding=encoding, errors=errors, newline="") else: # Binary mode - handle = open(handle, ioargs.mode) + handle = open(handle, mode) handles.append(handle) # Convert BytesIO or file objects passed with an encoding is_wrapped = False - if is_text and (compression or _is_binary_mode(handle, ioargs.mode)): + if is_text and ( + compression + or isinstance(handle, need_text_wrapping) + or "b" in getattr(handle, "mode", "") + ): handle = TextIOWrapper( handle, # type: ignore[arg-type] - encoding=ioargs.encoding, + encoding=encoding, errors=errors, newline="", ) handles.append(handle) - # only marked as wrapped when the caller provided a handle - is_wrapped = not ( - isinstance(ioargs.filepath_or_buffer, str) or ioargs.should_close - ) + # do not mark as wrapped when the user provided a string + is_wrapped = not is_path handles.reverse() # close the most recently added buffer first - if ioargs.should_close: - assert not isinstance(ioargs.filepath_or_buffer, str) - handles.append(ioargs.filepath_or_buffer) - assert not isinstance(handle, str) return IOHandles( handle=handle, created_handles=handles, is_wrapped=is_wrapped, is_mmap=memory_map, - compression=ioargs.compression, ) @@ -783,7 +804,7 @@ def _maybe_memory_map( mode: str, errors: Optional[str], ) -> Tuple[FileOrBuffer, bool, List[Buffer]]: - """Try to memory map file/buffer.""" + """Try to use memory map file/buffer.""" handles: List[Buffer] = [] memory_map &= hasattr(handle, "fileno") or isinstance(handle, str) if not memory_map: @@ -813,27 +834,3 @@ def _maybe_memory_map( memory_map = False return handle, memory_map, handles - - -def file_exists(filepath_or_buffer: FilePathOrBuffer) -> bool: - """Test whether file exists.""" - exists = False - filepath_or_buffer = stringify_path(filepath_or_buffer) - if not isinstance(filepath_or_buffer, str): - return exists - try: - exists = os.path.exists(filepath_or_buffer) - # gh-5874: if the filepath is too long will raise here - except (TypeError, ValueError): - pass - return exists - - -def _is_binary_mode(handle: FilePathOrBuffer, mode: str) -> bool: - """Whether the handle is opened in binary mode""" - # classes that expect bytes - binary_classes = [BufferedIOBase, RawIOBase] - - return isinstance(handle, tuple(binary_classes)) or "b" in getattr( - handle, "mode", mode - ) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index c519baa4c21da..dd30bf37793d0 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -3,12 +3,12 @@ from io import BufferedIOBase, BytesIO, RawIOBase import os from textwrap import fill -from typing import Any, Dict, Mapping, Union, cast +from typing import Any, Mapping, Union from pandas._config import config from pandas._libs.parsers import STR_NA_VALUES -from pandas._typing import Buffer, FilePathOrBuffer, StorageOptions +from pandas._typing import StorageOptions from pandas.errors import EmptyDataError from pandas.util._decorators import Appender, deprecate_nonkeyword_arguments @@ -16,7 +16,14 @@ from pandas.core.frame import DataFrame -from pandas.io.common import IOHandles, get_handle, stringify_path, validate_header_arg +from pandas.io.common import ( + IOArgs, + get_filepath_or_buffer, + is_url, + stringify_path, + urlopen, + validate_header_arg, +) from 
pandas.io.excel._util import ( fill_mi_header, get_default_writer, @@ -306,9 +313,7 @@ def read_excel( storage_options: StorageOptions = None, ): - should_close = False if not isinstance(io, ExcelFile): - should_close = True io = ExcelFile(io, storage_options=storage_options, engine=engine) elif engine and engine != io.engine: raise ValueError( @@ -316,57 +321,66 @@ def read_excel( "an ExcelFile - ExcelFile already has the engine set" ) - try: - data = io.parse( - sheet_name=sheet_name, - header=header, - names=names, - index_col=index_col, - usecols=usecols, - squeeze=squeeze, - dtype=dtype, - converters=converters, - true_values=true_values, - false_values=false_values, - skiprows=skiprows, - nrows=nrows, - na_values=na_values, - keep_default_na=keep_default_na, - na_filter=na_filter, - verbose=verbose, - parse_dates=parse_dates, - date_parser=date_parser, - thousands=thousands, - comment=comment, - skipfooter=skipfooter, - convert_float=convert_float, - mangle_dupe_cols=mangle_dupe_cols, - ) - finally: - # make sure to close opened file handles - if should_close: - io.close() - return data + return io.parse( + sheet_name=sheet_name, + header=header, + names=names, + index_col=index_col, + usecols=usecols, + squeeze=squeeze, + dtype=dtype, + converters=converters, + true_values=true_values, + false_values=false_values, + skiprows=skiprows, + nrows=nrows, + na_values=na_values, + keep_default_na=keep_default_na, + na_filter=na_filter, + verbose=verbose, + parse_dates=parse_dates, + date_parser=date_parser, + thousands=thousands, + comment=comment, + skipfooter=skipfooter, + convert_float=convert_float, + mangle_dupe_cols=mangle_dupe_cols, + ) class BaseExcelReader(metaclass=abc.ABCMeta): def __init__(self, filepath_or_buffer, storage_options: StorageOptions = None): - self.handles = IOHandles( - handle=filepath_or_buffer, compression={"method": None} + self.ioargs = IOArgs( + filepath_or_buffer=filepath_or_buffer, + encoding=None, + mode=None, + compression={"method": None}, ) - if not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)): - self.handles = get_handle( - filepath_or_buffer, "rb", storage_options=storage_options, is_text=False + # If filepath_or_buffer is a url, load the data into a BytesIO + if is_url(filepath_or_buffer): + self.ioargs = IOArgs( + filepath_or_buffer=BytesIO(urlopen(filepath_or_buffer).read()), + should_close=True, + encoding=None, + mode=None, + compression={"method": None}, + ) + elif not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)): + self.ioargs = get_filepath_or_buffer( + filepath_or_buffer, storage_options=storage_options ) - if isinstance(self.handles.handle, self._workbook_class): - self.book = self.handles.handle - elif hasattr(self.handles.handle, "read"): + if isinstance(self.ioargs.filepath_or_buffer, self._workbook_class): + self.book = self.ioargs.filepath_or_buffer + elif hasattr(self.ioargs.filepath_or_buffer, "read"): # N.B. 
xlrd.Book has a read attribute too - self.handles.handle.seek(0) - self.book = self.load_workbook(self.handles.handle) - elif isinstance(self.handles.handle, bytes): - self.book = self.load_workbook(BytesIO(self.handles.handle)) + assert not isinstance(self.ioargs.filepath_or_buffer, str) + self.ioargs.filepath_or_buffer.seek(0) + self.book = self.load_workbook(self.ioargs.filepath_or_buffer) + elif isinstance(self.ioargs.filepath_or_buffer, str): + self.book = self.load_workbook(self.ioargs.filepath_or_buffer) + elif isinstance(self.ioargs.filepath_or_buffer, bytes): + self.book = self.load_workbook(BytesIO(self.ioargs.filepath_or_buffer)) else: raise ValueError( "Must explicitly set engine if not passing in buffer or path for io." @@ -382,7 +396,7 @@ def load_workbook(self, filepath_or_buffer): pass def close(self): - self.handles.close() + self.ioargs.close() @property @abc.abstractmethod @@ -567,15 +581,9 @@ class ExcelWriter(metaclass=abc.ABCMeta): Format string for datetime objects written into Excel files. (e.g. 'YYYY-MM-DD HH:MM:SS'). mode : {'w', 'a'}, default 'w' - File mode to use (write or append). Append does not work with fsspec URLs. + File mode to use (write or append). .. versionadded:: 0.24.0 - storage_options : dict, optional - Extra options that make sense for a particular storage connection, e.g. - host, port, username, password, etc., if using a URL that will - be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". - - .. versionadded:: 1.2.0 Attributes ---------- @@ -719,12 +727,11 @@ def save(self): def __init__( self, - path: Union[FilePathOrBuffer, "ExcelWriter"], + path, engine=None, date_format=None, datetime_format=None, - mode: str = "w", - storage_options: StorageOptions = None, + mode="w", **engine_kwargs, ): # validate that this engine can handle the extension @@ -732,20 +739,8 @@ def __init__( ext = os.path.splitext(path)[-1] self.check_extension(ext) - # use mode to open the file - if "b" not in mode: - mode += "b" - # use "a" for the user to append data to excel but internally use "r+" to let - # the excel backend first read the existing file and then write any data to it - mode = mode.replace("a", "r+") - - # cast ExcelWriter to avoid adding 'if self.handles is not None' - self.handles = IOHandles(cast(Buffer, path), compression={"copression": None}) - if not isinstance(path, ExcelWriter): - self.handles = get_handle( - path, mode, storage_options=storage_options, is_text=False - ) - self.sheets: Dict[str, Any] = {} + self.path = path + self.sheets = {} self.cur_sheet = None if date_format is None: @@ -760,7 +755,10 @@ def __init__( self.mode = mode def __fspath__(self): - return getattr(self.handles.handle, "name", "") + # pandas\io\excel\_base.py:744: error: Argument 1 to "stringify_path" + # has incompatible type "Optional[Any]"; expected "Union[str, Path, + # IO[Any], IOBase]" [arg-type] + return stringify_path(self.path) # type: ignore[arg-type] def _get_sheet_name(self, sheet_name): if sheet_name is None: @@ -830,9 +828,7 @@ def __exit__(self, exc_type, exc_value, traceback): def close(self): """synonym for save, to make it more file-like""" - content = self.save() - self.handles.close() - return content + return self.save() def _is_ods_stream(stream: Union[BufferedIOBase, RawIOBase]) -> bool: diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index c5c3927216850..4f9f8a29c0010 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -19,7 +19,7 @@ class ODFReader(BaseExcelReader): 
filepath_or_buffer : string, path to be parsed or an open readable stream. storage_options : dict, optional - passed to fsspec for appropriate URLs (see ``_get_filepath_or_buffer``) + passed to fsspec for appropriate URLs (see ``get_filepath_or_buffer``) """ def __init__( @@ -69,7 +69,6 @@ def get_sheet_by_name(self, name: str): if table.getAttribute("name") == name: return table - self.close() raise ValueError(f"sheet {name} not found") def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: @@ -191,7 +190,6 @@ def _get_cell_value(self, cell, convert_float: bool) -> Scalar: result = cast(pd.Timestamp, result) return result.time() else: - self.close() raise ValueError(f"Unrecognized type {cell_type}") def _get_cell_string_value(self, cell) -> str: diff --git a/pandas/io/excel/_odswriter.py b/pandas/io/excel/_odswriter.py index 0bea19bec2cdd..cbac60dfabaa7 100644 --- a/pandas/io/excel/_odswriter.py +++ b/pandas/io/excel/_odswriter.py @@ -3,7 +3,6 @@ from typing import Any, DefaultDict, Dict, List, Optional, Tuple, Union import pandas._libs.json as json -from pandas._typing import StorageOptions from pandas.io.excel._base import ExcelWriter from pandas.io.excel._util import validate_freeze_panes @@ -15,12 +14,7 @@ class ODSWriter(ExcelWriter): supported_extensions = (".ods",) def __init__( - self, - path: str, - engine: Optional[str] = None, - mode: str = "w", - storage_options: StorageOptions = None, - **engine_kwargs, + self, path: str, engine: Optional[str] = None, mode: str = "w", **engine_kwargs ): from odf.opendocument import OpenDocumentSpreadsheet @@ -29,9 +23,7 @@ def __init__( if mode == "a": raise ValueError("Append mode is not supported with odf!") - super().__init__( - path, mode=mode, storage_options=storage_options, **engine_kwargs - ) + super().__init__(path, mode=mode, **engine_kwargs) self.book = OpenDocumentSpreadsheet() self._style_dict: Dict[str, str] = {} @@ -42,7 +34,7 @@ def save(self) -> None: """ for sheet in self.sheets.values(): self.book.spreadsheet.addElement(sheet) - self.book.save(self.handles.handle) + self.book.save(self.path) def write_cells( self, @@ -182,7 +174,7 @@ def _process_style(self, style: Dict[str, Any]) -> str: Returns ------- style_key : str - Unique style key for later reference in sheet + Unique style key for for later reference in sheet """ from odf.style import ( ParagraphProperties, diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 7de958df206d5..a5cadf4d93389 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -16,27 +16,16 @@ class OpenpyxlWriter(ExcelWriter): engine = "openpyxl" supported_extensions = (".xlsx", ".xlsm") - def __init__( - self, - path, - engine=None, - mode: str = "w", - storage_options: StorageOptions = None, - **engine_kwargs, - ): + def __init__(self, path, engine=None, mode="w", **engine_kwargs): # Use the openpyxl module as the Excel writer. from openpyxl.workbook import Workbook - super().__init__( - path, mode=mode, storage_options=storage_options, **engine_kwargs - ) + super().__init__(path, mode=mode, **engine_kwargs) - # ExcelWriter replaced "a" by "r+" to allow us to first read the excel file from - # the file and later write to it - if "r+" in self.mode: # Load from existing workbook + if self.mode == "a": # Load from existing workbook from openpyxl import load_workbook - self.book = load_workbook(self.handles.handle) + self.book = load_workbook(self.path) else: # Create workbook object with default optimized_write=True. 
self.book = Workbook() @@ -48,7 +37,7 @@ def save(self): """ Save workbook to disk. """ - self.book.save(self.handles.handle) + self.book.save(self.path) @classmethod def _convert_to_style_kwargs(cls, style_dict: dict) -> Dict[str, "Serialisable"]: @@ -463,7 +452,7 @@ def __init__( filepath_or_buffer : string, path object or Workbook Object to be parsed. storage_options : dict, optional - passed to fsspec for appropriate URLs (see ``_get_filepath_or_buffer``) + passed to fsspec for appropriate URLs (see ``get_filepath_or_buffer``) """ import_optional_dependency("openpyxl") super().__init__(filepath_or_buffer, storage_options=storage_options) @@ -485,7 +474,6 @@ def close(self): # https://stackoverflow.com/questions/31416842/ # openpyxl-does-not-close-excel-workbook-in-read-only-mode self.book.close() - super().close() @property def sheet_names(self) -> List[str]: diff --git a/pandas/io/excel/_pyxlsb.py b/pandas/io/excel/_pyxlsb.py index de4f7bba1a179..ac94f4dd3df74 100644 --- a/pandas/io/excel/_pyxlsb.py +++ b/pandas/io/excel/_pyxlsb.py @@ -20,7 +20,7 @@ def __init__( filepath_or_buffer : str, path object, or Workbook Object to be parsed. storage_options : dict, optional - passed to fsspec for appropriate URLs (see ``_get_filepath_or_buffer``) + passed to fsspec for appropriate URLs (see ``get_filepath_or_buffer``) """ import_optional_dependency("pyxlsb") # This will call load_workbook on the filepath or buffer diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index c655db4bc772b..dfd5dde0329ae 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -18,7 +18,7 @@ def __init__(self, filepath_or_buffer, storage_options: StorageOptions = None): filepath_or_buffer : string, path object or Workbook Object to be parsed. storage_options : dict, optional - passed to fsspec for appropriate URLs (see ``_get_filepath_or_buffer``) + passed to fsspec for appropriate URLs (see ``get_filepath_or_buffer``) """ err_msg = "Install xlrd >= 1.0.0 for Excel support" import_optional_dependency("xlrd", extra=err_msg) diff --git a/pandas/io/excel/_xlsxwriter.py b/pandas/io/excel/_xlsxwriter.py index d7bbec578d89d..16c4d377d7610 100644 --- a/pandas/io/excel/_xlsxwriter.py +++ b/pandas/io/excel/_xlsxwriter.py @@ -1,7 +1,6 @@ from typing import Dict, List, Tuple import pandas._libs.json as json -from pandas._typing import StorageOptions from pandas.io.excel._base import ExcelWriter from pandas.io.excel._util import validate_freeze_panes @@ -169,8 +168,7 @@ def __init__( engine=None, date_format=None, datetime_format=None, - mode: str = "w", - storage_options: StorageOptions = None, + mode="w", **engine_kwargs, ): # Use the xlsxwriter module as the Excel writer. 
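A usage sketch of the append mode handled in the openpyxl writer hunk above (the file name is a placeholder, the target file must already exist, and openpyxl must be installed):

>>> df = pd.DataFrame({"a": [1, 2]})
>>> with pd.ExcelWriter("report.xlsx", mode="a", engine="openpyxl") as writer:  # doctest: +SKIP
...     df.to_excel(writer, sheet_name="extra")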
@@ -185,11 +183,10 @@ def __init__( date_format=date_format, datetime_format=datetime_format, mode=mode, - storage_options=storage_options, **engine_kwargs, ) - self.book = Workbook(self.handles.handle, **engine_kwargs) + self.book = Workbook(path, **engine_kwargs) def save(self): """ diff --git a/pandas/io/excel/_xlwt.py b/pandas/io/excel/_xlwt.py index 9ede7cd0c2b95..3592c2684f5a5 100644 --- a/pandas/io/excel/_xlwt.py +++ b/pandas/io/excel/_xlwt.py @@ -1,7 +1,6 @@ from typing import TYPE_CHECKING, Dict import pandas._libs.json as json -from pandas._typing import StorageOptions from pandas.io.excel._base import ExcelWriter from pandas.io.excel._util import validate_freeze_panes @@ -14,15 +13,7 @@ class XlwtWriter(ExcelWriter): engine = "xlwt" supported_extensions = (".xls",) - def __init__( - self, - path, - engine=None, - encoding=None, - mode: str = "w", - storage_options: StorageOptions = None, - **engine_kwargs, - ): + def __init__(self, path, engine=None, encoding=None, mode="w", **engine_kwargs): # Use the xlwt module as the Excel writer. import xlwt @@ -31,9 +22,7 @@ def __init__( if mode == "a": raise ValueError("Append mode is not supported with xlwt!") - super().__init__( - path, mode=mode, storage_options=storage_options, **engine_kwargs - ) + super().__init__(path, mode=mode, **engine_kwargs) if encoding is None: encoding = "ascii" @@ -45,7 +34,7 @@ def save(self): """ Save workbook to disk. """ - self.book.save(self.handles.handle) + self.book.save(self.path) def write_cells( self, cells, sheet_name=None, startrow=0, startcol=0, freeze_panes=None diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index 422677771b4d0..198acd5862d45 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -4,15 +4,12 @@ from pandas._typing import FilePathOrBuffer, StorageOptions from pandas.compat._optional import import_optional_dependency -from pandas.util._decorators import doc from pandas import DataFrame, Int64Index, RangeIndex -from pandas.core import generic -from pandas.io.common import get_handle +from pandas.io.common import get_filepath_or_buffer -@doc(storage_options=generic._shared_docs["storage_options"]) def to_feather( df: DataFrame, path: FilePathOrBuffer[AnyStr], @@ -26,7 +23,13 @@ def to_feather( ---------- df : DataFrame path : string file path, or file-like object - {storage_options} + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values. .. 
versionadded:: 1.2.0 @@ -38,6 +41,8 @@ def to_feather( import_optional_dependency("pyarrow") from pyarrow import feather + ioargs = get_filepath_or_buffer(path, mode="wb", storage_options=storage_options) + if not isinstance(df, DataFrame): raise ValueError("feather only support IO with DataFrames") @@ -74,13 +79,11 @@ def to_feather( if df.columns.inferred_type not in valid_types: raise ValueError("feather must have string column names") - with get_handle( - path, "wb", storage_options=storage_options, is_text=False - ) as handles: - feather.write_feather(df, handles.handle, **kwargs) + feather.write_feather(df, ioargs.filepath_or_buffer, **kwargs) + + ioargs.close() -@doc(storage_options=generic._shared_docs["storage_options"]) def read_feather( path, columns=None, use_threads: bool = True, storage_options: StorageOptions = None ): @@ -109,7 +112,13 @@ def read_feather( Whether to parallelize reading using multiple threads. .. versionadded:: 0.24.0 - {storage_options} + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values. .. versionadded:: 1.2.0 @@ -120,10 +129,12 @@ def read_feather( import_optional_dependency("pyarrow") from pyarrow import feather - with get_handle( - path, "rb", storage_options=storage_options, is_text=False - ) as handles: + ioargs = get_filepath_or_buffer(path, storage_options=storage_options) - return feather.read_feather( - handles.handle, columns=columns, use_threads=bool(use_threads) - ) + df = feather.read_feather( + ioargs.filepath_or_buffer, columns=columns, use_threads=bool(use_threads) + ) + + ioargs.close() + + return df diff --git a/pandas/io/formats/console.py b/pandas/io/formats/console.py index ea291bcbfa44c..ab9c9fe995008 100644 --- a/pandas/io/formats/console.py +++ b/pandas/io/formats/console.py @@ -78,7 +78,7 @@ def check_main(): def in_ipython_frontend(): """ - Check if we're inside an IPython zmq frontend. + Check if we're inside an an IPython zmq frontend. 
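A round-trip sketch for the feather reader/writer touched above (requires pyarrow; the path is a placeholder):

>>> df = pd.DataFrame({"a": [1, 2, 3]})
>>> df.to_feather("tmp.feather")                   # doctest: +SKIP
>>> pd.read_feather("tmp.feather", columns=["a"])  # doctest: +SKIP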
Returns ------- diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index fbda78a1842ca..20226dbb3c9d4 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -28,7 +28,7 @@ from pandas.core.indexes.api import Index -from pandas.io.common import get_handle +from pandas.io.common import get_filepath_or_buffer, get_handle if TYPE_CHECKING: from pandas.io.formats.format import DataFrameFormatter @@ -59,11 +59,13 @@ def __init__( self.obj = self.fmt.frame - self.filepath_or_buffer = path_or_buf - self.encoding = encoding - self.compression = compression - self.mode = mode - self.storage_options = storage_options + self.ioargs = get_filepath_or_buffer( + path_or_buf, + encoding=encoding, + compression=compression, + mode=mode, + storage_options=storage_options, + ) self.sep = sep self.index_label = self._initialize_index_label(index_label) @@ -144,7 +146,7 @@ def _initialize_columns(self, cols: Optional[Sequence[Label]]) -> Sequence[Label self.obj = self.obj.loc[:, cols] # update columns to include possible multiplicity of dupes - # and make sure cols is just a list of labels + # and make sure sure cols is just a list of labels new_cols = self.obj.columns if isinstance(new_cols, ABCIndexClass): return new_cols._format_native_types(**self._number_format) @@ -225,15 +227,15 @@ def save(self) -> None: Create the writer & save. """ # apply compression and byte/text conversion - with get_handle( - self.filepath_or_buffer, - self.mode, - encoding=self.encoding, + handles = get_handle( + self.ioargs.filepath_or_buffer, + self.ioargs.mode, + encoding=self.ioargs.encoding, errors=self.errors, - compression=self.compression, - storage_options=self.storage_options, - ) as handles: + compression=self.ioargs.compression, + ) + try: # Note: self.encoding is irrelevant here self.writer = csvlib.writer( handles.handle, # type: ignore[arg-type] @@ -247,6 +249,12 @@ def save(self) -> None: self._save() + finally: + # close compression and byte/text wrapper + handles.close() + # close any fsspec-like objects + self.ioargs.close() + def _save(self) -> None: if self._need_to_save_header: self._save_header() diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py index bded853f383e0..0916494d8ab60 100644 --- a/pandas/io/formats/excel.py +++ b/pandas/io/formats/excel.py @@ -5,22 +5,21 @@ from functools import reduce import itertools import re -from typing import Callable, Dict, Iterable, Mapping, Optional, Sequence, Union, cast +from typing import Callable, Dict, Mapping, Optional, Sequence, Union import warnings import numpy as np -from pandas._libs.lib import is_list_like -from pandas._typing import Label, StorageOptions -from pandas.util._decorators import doc +from pandas._typing import Label from pandas.core.dtypes import missing from pandas.core.dtypes.common import is_float, is_scalar +from pandas.core.dtypes.generic import ABCIndex from pandas import DataFrame, Index, MultiIndex, PeriodIndex -from pandas.core import generic import pandas.core.common as com +from pandas.io.common import stringify_path from pandas.io.formats.css import CSSResolver, CSSWarning from pandas.io.formats.format import get_level_lengths from pandas.io.formats.printing import pprint_thing @@ -31,13 +30,7 @@ class ExcelCell: __slots__ = __fields__ def __init__( - self, - row: int, - col: int, - val, - style=None, - mergestart: Optional[int] = None, - mergeend: Optional[int] = None, + self, row: int, col: int, val, style=None, mergestart=None, mergeend=None ): self.row = row self.col = col @@ 
-431,7 +424,7 @@ class ExcelFormatter: Format string for floating point numbers cols : sequence, optional Columns to write - header : boolean or sequence of str, default True + header : boolean or list of string, default True Write out column names. If a list of string is given it is assumed to be aliases for the column names index : boolean, default True @@ -530,7 +523,7 @@ def _format_value(self, val): ) return val - def _format_header_mi(self) -> Iterable[ExcelCell]: + def _format_header_mi(self): if self.columns.nlevels > 1: if not self.index: raise NotImplementedError( @@ -538,7 +531,8 @@ def _format_header_mi(self) -> Iterable[ExcelCell]: "index ('index'=False) is not yet implemented." ) - if not (self._has_aliases or self.header): + has_aliases = isinstance(self.header, (tuple, list, np.ndarray, ABCIndex)) + if not (has_aliases or self.header): return columns = self.columns @@ -554,30 +548,28 @@ def _format_header_mi(self) -> Iterable[ExcelCell]: if self.merge_cells: # Format multi-index as a merged cells. - for lnum, name in enumerate(columns.names): - yield ExcelCell( - row=lnum, - col=coloffset, - val=name, - style=self.header_style, - ) + for lnum in range(len(level_lengths)): + name = columns.names[lnum] + yield ExcelCell(lnum, coloffset, name, self.header_style) for lnum, (spans, levels, level_codes) in enumerate( zip(level_lengths, columns.levels, columns.codes) ): values = levels.take(level_codes) - for i, span_val in spans.items(): - spans_multiple_cells = span_val > 1 - yield ExcelCell( - row=lnum, - col=coloffset + i + 1, - val=values[i], - style=self.header_style, - mergestart=lnum if spans_multiple_cells else None, - mergeend=( - coloffset + i + span_val if spans_multiple_cells else None - ), - ) + for i in spans: + if spans[i] > 1: + yield ExcelCell( + lnum, + coloffset + i + 1, + values[i], + self.header_style, + lnum, + coloffset + i + spans[i], + ) + else: + yield ExcelCell( + lnum, coloffset + i + 1, values[i], self.header_style + ) else: # Format in legacy format with dots to indicate levels. 
for i, values in enumerate(zip(*level_strs)): @@ -586,8 +578,9 @@ def _format_header_mi(self) -> Iterable[ExcelCell]: self.rowcounter = lnum - def _format_header_regular(self) -> Iterable[ExcelCell]: - if self._has_aliases or self.header: + def _format_header_regular(self): + has_aliases = isinstance(self.header, (tuple, list, np.ndarray, ABCIndex)) + if has_aliases or self.header: coloffset = 0 if self.index: @@ -596,11 +589,17 @@ def _format_header_regular(self) -> Iterable[ExcelCell]: coloffset = len(self.df.index[0]) colnames = self.columns - if self._has_aliases: - self.header = cast(Sequence, self.header) - if len(self.header) != len(self.columns): + if has_aliases: + # pandas\io\formats\excel.py:593: error: Argument 1 to "len" + # has incompatible type "Union[Sequence[Optional[Hashable]], + # bool]"; expected "Sized" [arg-type] + if len(self.header) != len(self.columns): # type: ignore[arg-type] + # pandas\io\formats\excel.py:602: error: Argument 1 to + # "len" has incompatible type + # "Union[Sequence[Optional[Hashable]], bool]"; expected + # "Sized" [arg-type] raise ValueError( - f"Writing {len(self.columns)} cols " + f"Writing {len(self.columns)} cols " # type: ignore[arg-type] f"but got {len(self.header)} aliases" ) else: @@ -611,7 +610,7 @@ def _format_header_regular(self) -> Iterable[ExcelCell]: self.rowcounter, colindex + coloffset, colname, self.header_style ) - def _format_header(self) -> Iterable[ExcelCell]: + def _format_header(self): if isinstance(self.columns, MultiIndex): gen = self._format_header_mi() else: @@ -633,14 +632,15 @@ def _format_header(self) -> Iterable[ExcelCell]: self.rowcounter += 1 return itertools.chain(gen, gen2) - def _format_body(self) -> Iterable[ExcelCell]: + def _format_body(self): if isinstance(self.df.index, MultiIndex): return self._format_hierarchical_rows() else: return self._format_regular_rows() - def _format_regular_rows(self) -> Iterable[ExcelCell]: - if self._has_aliases or self.header: + def _format_regular_rows(self): + has_aliases = isinstance(self.header, (tuple, list, np.ndarray, ABCIndex)) + if has_aliases or self.header: self.rowcounter += 1 # output index and index_label? 
@@ -677,8 +677,9 @@ def _format_regular_rows(self) -> Iterable[ExcelCell]: yield from self._generate_body(coloffset) - def _format_hierarchical_rows(self) -> Iterable[ExcelCell]: - if self._has_aliases or self.header: + def _format_hierarchical_rows(self): + has_aliases = isinstance(self.header, (tuple, list, np.ndarray, ABCIndex)) + if has_aliases or self.header: self.rowcounter += 1 gcolidx = 0 @@ -721,20 +722,23 @@ def _format_hierarchical_rows(self) -> Iterable[ExcelCell]: fill_value=levels._na_value, ) - for i, span_val in spans.items(): - spans_multiple_cells = span_val > 1 - yield ExcelCell( - row=self.rowcounter + i, - col=gcolidx, - val=values[i], - style=self.header_style, - mergestart=( - self.rowcounter + i + span_val - 1 - if spans_multiple_cells - else None - ), - mergeend=gcolidx if spans_multiple_cells else None, - ) + for i in spans: + if spans[i] > 1: + yield ExcelCell( + self.rowcounter + i, + gcolidx, + values[i], + self.header_style, + self.rowcounter + i + spans[i] - 1, + gcolidx, + ) + else: + yield ExcelCell( + self.rowcounter + i, + gcolidx, + values[i], + self.header_style, + ) gcolidx += 1 else: @@ -742,21 +746,16 @@ def _format_hierarchical_rows(self) -> Iterable[ExcelCell]: for indexcolvals in zip(*self.df.index): for idx, indexcolval in enumerate(indexcolvals): yield ExcelCell( - row=self.rowcounter + idx, - col=gcolidx, - val=indexcolval, - style=self.header_style, + self.rowcounter + idx, + gcolidx, + indexcolval, + self.header_style, ) gcolidx += 1 yield from self._generate_body(gcolidx) - @property - def _has_aliases(self) -> bool: - """Whether the aliases for column names are present.""" - return is_list_like(self.header) - - def _generate_body(self, coloffset: int) -> Iterable[ExcelCell]: + def _generate_body(self, coloffset: int): if self.styler is None: styles = None else: @@ -773,12 +772,11 @@ def _generate_body(self, coloffset: int) -> Iterable[ExcelCell]: xlstyle = self.style_converter(";".join(styles[i, colidx])) yield ExcelCell(self.rowcounter + i, colidx + coloffset, val, xlstyle) - def get_formatted_cells(self) -> Iterable[ExcelCell]: + def get_formatted_cells(self): for cell in itertools.chain(self._format_header(), self._format_body()): cell.val = self._format_value(cell.val) yield cell - @doc(storage_options=generic._shared_docs["storage_options"]) def write( self, writer, @@ -787,10 +785,9 @@ def write( startcol=0, freeze_panes=None, engine=None, - storage_options: StorageOptions = None, ): """ - writer : path-like, file-like, or ExcelWriter object + writer : string or ExcelWriter object File path or existing ExcelWriter sheet_name : string, default 'Sheet1' Name of sheet which will contain DataFrame @@ -805,9 +802,6 @@ def write( write engine to use if writer is a path - you can also set this via the options ``io.excel.xlsx.writer``, ``io.excel.xls.writer``, and ``io.excel.xlsm.writer``. - {storage_options} - - .. 
versionadded:: 1.2.0 """ from pandas.io.excel import ExcelWriter @@ -818,7 +812,6 @@ def write( f"Max sheet size is: {self.max_rows}, {self.max_cols}" ) - formatted_cells = self.get_formatted_cells() if isinstance(writer, ExcelWriter): need_save = False else: @@ -826,19 +819,17 @@ def write( # abstract class 'ExcelWriter' with abstract attributes 'engine', # 'save', 'supported_extensions' and 'write_cells' [abstract] writer = ExcelWriter( # type: ignore[abstract] - writer, engine=engine, storage_options=storage_options + stringify_path(writer), engine=engine ) need_save = True - try: - writer.write_cells( - formatted_cells, - sheet_name, - startrow=startrow, - startcol=startcol, - freeze_panes=freeze_panes, - ) - finally: - # make sure to close opened file handles - if need_save: - writer.close() + formatted_cells = self.get_formatted_cells() + writer.write_cells( + formatted_cells, + sheet_name, + startrow=startrow, + startcol=startcol, + freeze_panes=freeze_panes, + ) + if need_save: + writer.save() diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index db34b882a3c35..e4bd1eddbc5f8 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -5,6 +5,7 @@ from contextlib import contextmanager from csv import QUOTE_NONE, QUOTE_NONNUMERIC +from datetime import tzinfo import decimal from functools import partial from io import StringIO @@ -35,6 +36,7 @@ from pandas._libs import lib from pandas._libs.missing import NA +from pandas._libs.tslib import format_array_from_datetime from pandas._libs.tslibs import NaT, Timedelta, Timestamp, iNaT from pandas._libs.tslibs.nattype import NaTType from pandas._typing import ( @@ -637,31 +639,20 @@ def _calc_max_cols_fitted(self) -> Optional[int]: def _calc_max_rows_fitted(self) -> Optional[int]: """Number of rows with data fitting the screen.""" - max_rows: Optional[int] + if not self._is_in_terminal(): + return self.max_rows - if self._is_in_terminal(): - _, height = get_terminal_size() - if self.max_rows == 0: - # rows available to fill with actual data - return height - self._get_number_of_auxillary_rows() + _, height = get_terminal_size() + if self.max_rows == 0: + # rows available to fill with actual data + return height - self._get_number_of_auxillary_rows() - if self._is_screen_short(height): - max_rows = height - else: - max_rows = self.max_rows + max_rows: Optional[int] + if self._is_screen_short(height): + max_rows = height else: max_rows = self.max_rows - return self._adjust_max_rows(max_rows) - - def _adjust_max_rows(self, max_rows: Optional[int]) -> Optional[int]: - """Adjust max_rows using display logic. 
- - See description here: - https://pandas.pydata.org/docs/dev/user_guide/options.html#frequently-used-options - - GH #37359 - """ if max_rows: if (len(self.frame) > max_rows) and self.min_rows: # if truncated, set max_rows showed to min_rows @@ -829,7 +820,7 @@ def _get_formatted_column_labels(self, frame: "DataFrame") -> List[List[str]]: dtypes = self.frame.dtypes._values # if we have a Float level, they don't use leading space at all - restrict_formatting = any(level.is_floating for level in columns.levels) + restrict_formatting = any(l.is_floating for l in columns.levels) need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes))) def space_format(x, y): @@ -1527,9 +1518,11 @@ def _format_strings(self) -> List[str]: if self.formatter is not None and callable(self.formatter): return [self.formatter(x) for x in values] - fmt_values = values._data._format_native_types( - na_rep=self.nat_rep, date_format=self.date_format - ) + fmt_values = format_array_from_datetime( + values.asi8.ravel(), + format=get_format_datetime64_from_values(values, self.date_format), + na_rep=self.nat_rep, + ).reshape(values.shape) return fmt_values.tolist() @@ -1537,9 +1530,7 @@ class ExtensionArrayFormatter(GenericArrayFormatter): def _format_strings(self) -> List[str]: values = extract_array(self.values, extract_numpy=True) - formatter = self.formatter - if formatter is None: - formatter = values._formatter(boxed=True) + formatter = values._formatter(boxed=True) if is_categorical_dtype(values.dtype): # Categorical is special for now, so that we can preserve tzinfo @@ -1555,9 +1546,7 @@ def _format_strings(self) -> List[str]: digits=self.digits, space=self.space, justify=self.justify, - decimal=self.decimal, leading_space=self.leading_space, - quoting=self.quoting, ) return fmt_values @@ -1653,21 +1642,30 @@ def is_dates_only( return False -def _format_datetime64(x: Union[NaTType, Timestamp], nat_rep: str = "NaT") -> str: - if x is NaT: +def _format_datetime64( + x: Union[NaTType, Timestamp], tz: Optional[tzinfo] = None, nat_rep: str = "NaT" +) -> str: + if x is None or (is_scalar(x) and isna(x)): return nat_rep + if tz is not None or not isinstance(x, Timestamp): + if getattr(x, "tzinfo", None) is not None: + x = Timestamp(x).tz_convert(tz) + else: + x = Timestamp(x).tz_localize(tz) + return str(x) def _format_datetime64_dateonly( - x: Union[NaTType, Timestamp], - nat_rep: str = "NaT", - date_format: Optional[str] = None, + x: Union[NaTType, Timestamp], nat_rep: str = "NaT", date_format: None = None ) -> str: - if x is NaT: + if x is None or (is_scalar(x) and isna(x)): return nat_rep + if not isinstance(x, Timestamp): + x = Timestamp(x) + if date_format: return x.strftime(date_format) else: @@ -1675,15 +1673,15 @@ def _format_datetime64_dateonly( def get_format_datetime64( - is_dates_only: bool, nat_rep: str = "NaT", date_format: Optional[str] = None + is_dates_only: bool, nat_rep: str = "NaT", date_format: None = None ) -> Callable: if is_dates_only: - return lambda x: _format_datetime64_dateonly( + return lambda x, tz=None: _format_datetime64_dateonly( x, nat_rep=nat_rep, date_format=date_format ) else: - return lambda x: _format_datetime64(x, nat_rep=nat_rep) + return lambda x, tz=None: _format_datetime64(x, tz=tz, nat_rep=nat_rep) def get_format_datetime64_from_values( diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 98bd159c567b1..891b3ea7af0e2 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -1,20 +1,10 @@ from abc import ABC, abstractmethod import 
sys -from typing import ( - IO, - TYPE_CHECKING, - Iterable, - Iterator, - List, - Mapping, - Optional, - Sequence, - Union, -) +from typing import IO, TYPE_CHECKING, Iterator, List, Mapping, Optional, Sequence, Union from pandas._config import get_option -from pandas._typing import Dtype, FrameOrSeriesUnion +from pandas._typing import Dtype, FrameOrSeries from pandas.core.indexes.api import Index @@ -23,6 +13,7 @@ if TYPE_CHECKING: from pandas.core.frame import DataFrame + from pandas.core.series import Series def _put_str(s: Union[str, Dtype], space: int) -> str: @@ -92,12 +83,11 @@ def _initialize_memory_usage( class BaseInfo(ABC): - """ - Base class for DataFrameInfo and SeriesInfo. + """Base class for DataFrameInfo and SeriesInfo. Parameters ---------- - data : DataFrame or Series + data : FrameOrSeries Either dataframe or series. memory_usage : bool or str, optional If "deep", introspect the data deeply by interrogating object dtypes @@ -105,20 +95,18 @@ class BaseInfo(ABC): values. """ - data: FrameOrSeriesUnion - memory_usage: Union[bool, str] + def __init__( + self, + data: FrameOrSeries, + memory_usage: Optional[Union[bool, str]] = None, + ): + self.data = data + self.memory_usage = _initialize_memory_usage(memory_usage) @property @abstractmethod - def dtypes(self) -> Iterable[Dtype]: - """ - Dtypes. - - Returns - ------- - dtypes : sequence - Dtype of each of the DataFrame's columns (or one series column). - """ + def ids(self) -> Index: + """Column names or index names.""" @property @abstractmethod @@ -132,15 +120,30 @@ def non_null_counts(self) -> Sequence[int]: @property @abstractmethod - def memory_usage_bytes(self) -> int: + def dtypes(self) -> "Series": + """Dtypes. + + Returns + ------- + dtypes : Series + Dtype of each of the DataFrame's columns. """ - Memory usage in bytes. + return self.data.dtypes + + @property + def memory_usage_bytes(self) -> int: + """Memory usage in bytes. Returns ------- memory_usage_bytes : int Object's total memory usage in bytes. """ + if self.memory_usage == "deep": + deep = True + else: + deep = False + return self.data.memory_usage(index=True, deep=deep).sum() @property def memory_usage_string(self) -> str: @@ -162,8 +165,49 @@ def size_qualifier(self) -> str: size_qualifier = "+" return size_qualifier - @abstractmethod - def render( + +class DataFrameInfo(BaseInfo): + """Class storing dataframe-specific info.""" + + @property + def ids(self) -> Index: + """Column names. + + Returns + ------- + ids : Index + DataFrame's column names. + """ + return self.data.columns + + @property + def dtypes(self) -> "Series": + """Dtypes. + + Returns + ------- + dtypes : Series + Dtype of each of the DataFrame's columns. + """ + return self.data.dtypes + + @property + def dtype_counts(self) -> Mapping[str, int]: + """Mapping dtype - number of counts.""" + # groupby dtype.name to collect e.g. Categorical columns + return self.dtypes.value_counts().groupby(lambda x: x.name).sum() + + @property + def non_null_counts(self) -> Sequence[int]: + """Sequence of non-null counts for all columns.""" + return self.data.count() + + @property + def col_count(self) -> int: + """Number of columns to be summarized.""" + return len(self.ids) + + def to_buffer( self, *, buf: Optional[IO[str]], @@ -176,7 +220,6 @@ def render( This method prints information about a %(klass)s including the index dtype%(type_sub)s, non-null values and memory usage. 
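# Illustrative sketch, not part of the patch: the DataFrameInfo/table-builder code in
# this file backs ``DataFrame.info``; "deep" memory introspection interrogates object
# columns for an accurate (slower) memory figure. Output described in comments only.
import pandas as pd

df = pd.DataFrame({"col_a": ["x", "y", None], "col_b": [1.0, 2.0, 3.0]})

# Default: class, index range, per-column non-null counts and dtypes, memory usage.
df.info()

# Deep introspection sums the actual memory of object values instead of estimating it.
df.info(memory_usage="deep", verbose=True)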
- %(version_added_sub)s\ Parameters ---------- @@ -203,7 +246,12 @@ def render( consume the same memory amount for corresponding dtypes. With deep memory introspection, a real memory usage calculation is performed at the cost of computational resources. - %(show_counts_sub)s + null_counts : bool, optional + Whether to show the non-null counts. By default, this is shown + only if the %(klass)s is smaller than + ``pandas.options.display.max_info_rows`` and + ``pandas.options.display.max_info_columns``. A value of True always + shows the counts, and False never shows the counts. Returns ------- @@ -218,76 +266,7 @@ def render( -------- %(examples_sub)s """ - - -class DataFrameInfo(BaseInfo): - """ - Class storing dataframe-specific info. - """ - - def __init__( - self, - data: "DataFrame", - memory_usage: Optional[Union[bool, str]] = None, - ): - self.data: "DataFrame" = data - self.memory_usage = _initialize_memory_usage(memory_usage) - - @property - def dtype_counts(self) -> Mapping[str, int]: - return _get_dataframe_dtype_counts(self.data) - - @property - def dtypes(self) -> Iterable[Dtype]: - """ - Dtypes. - - Returns - ------- - dtypes - Dtype of each of the DataFrame's columns. - """ - return self.data.dtypes - - @property - def ids(self) -> Index: - """ - Column names. - - Returns - ------- - ids : Index - DataFrame's column names. - """ - return self.data.columns - - @property - def col_count(self) -> int: - """Number of columns to be summarized.""" - return len(self.ids) - - @property - def non_null_counts(self) -> Sequence[int]: - """Sequence of non-null counts for all columns or column (if series).""" - return self.data.count() - - @property - def memory_usage_bytes(self) -> int: - if self.memory_usage == "deep": - deep = True - else: - deep = False - return self.data.memory_usage(index=True, deep=deep).sum() - - def render( - self, - *, - buf: Optional[IO[str]], - max_cols: Optional[int], - verbose: Optional[bool], - show_counts: Optional[bool], - ) -> None: - printer = DataFrameInfoPrinter( + printer = InfoPrinter( info=self, max_cols=max_cols, verbose=verbose, @@ -296,27 +275,8 @@ def render( printer.to_buffer(buf) -class InfoPrinterAbstract: - """ - Class for printing dataframe or series info. - """ - - def to_buffer(self, buf: Optional[IO[str]] = None) -> None: - """Save dataframe info into buffer.""" - table_builder = self._create_table_builder() - lines = table_builder.get_lines() - if buf is None: # pragma: no cover - buf = sys.stdout - fmt.buffer_put_lines(buf, lines) - - @abstractmethod - def _create_table_builder(self) -> "TableBuilderAbstract": - """Create instance of table builder.""" - - -class DataFrameInfoPrinter(InfoPrinterAbstract): - """ - Class for printing dataframe info. +class InfoPrinter: + """Class for printing dataframe or series info. Parameters ---------- @@ -374,6 +334,14 @@ def _initialize_show_counts(self, show_counts: Optional[bool]) -> bool: else: return show_counts + def to_buffer(self, buf: Optional[IO[str]] = None) -> None: + """Save dataframe info into buffer.""" + table_builder = self._create_table_builder() + lines = table_builder.get_lines() + if buf is None: # pragma: no cover + buf = sys.stdout + fmt.buffer_put_lines(buf, lines) + def _create_table_builder(self) -> "DataFrameTableBuilder": """ Create instance of table builder based on verbosity and display settings. @@ -396,73 +364,26 @@ def _create_table_builder(self) -> "DataFrameTableBuilder": class TableBuilderAbstract(ABC): - """ - Abstract builder for info table. 
+ """Abstract builder for info table. + + Parameters + ---------- + info : BaseInfo + Instance of DataFrameInfo or SeriesInfo. """ _lines: List[str] - info: BaseInfo + + def __init__(self, *, info): + self.info = info @abstractmethod def get_lines(self) -> List[str]: """Product in a form of list of lines (strings).""" - @property - def data(self) -> FrameOrSeriesUnion: - return self.info.data - - @property - def dtypes(self) -> Iterable[Dtype]: - """Dtypes of each of the DataFrame's columns.""" - return self.info.dtypes - - @property - def dtype_counts(self) -> Mapping[str, int]: - """Mapping dtype - number of counts.""" - return self.info.dtype_counts - - @property - def display_memory_usage(self) -> bool: - """Whether to display memory usage.""" - return bool(self.info.memory_usage) - - @property - def memory_usage_string(self) -> str: - """Memory usage string with proper size qualifier.""" - return self.info.memory_usage_string - - @property - def non_null_counts(self) -> Sequence[int]: - return self.info.non_null_counts - - def add_object_type_line(self) -> None: - """Add line with string representation of dataframe to the table.""" - self._lines.append(str(type(self.data))) - - def add_index_range_line(self) -> None: - """Add line with range of indices to the table.""" - self._lines.append(self.data.index._summary()) - - def add_dtypes_line(self) -> None: - """Add summary line with dtypes present in dataframe.""" - collected_dtypes = [ - f"{key}({val:d})" for key, val in sorted(self.dtype_counts.items()) - ] - self._lines.append(f"dtypes: {', '.join(collected_dtypes)}") - class DataFrameTableBuilder(TableBuilderAbstract): - """ - Abstract builder for dataframe info table. - - Parameters - ---------- - info : DataFrameInfo. - Instance of DataFrameInfo. 
- """ - - def __init__(self, *, info: DataFrameInfo): - self.info: DataFrameInfo = info + """Abstract builder for dataframe info table.""" def get_lines(self) -> List[str]: self._lines = [] @@ -478,62 +399,144 @@ def _fill_empty_info(self) -> None: self.add_index_range_line() self._lines.append(f"Empty {type(self.data).__name__}") - @abstractmethod def _fill_non_empty_info(self) -> None: """Add lines to the info table, pertaining to non-empty dataframe.""" + self.add_object_type_line() + self.add_index_range_line() + self.add_columns_summary_line() + self.add_header_line() + self.add_separator_line() + self.add_body_lines() + self.add_dtypes_line() + if self.display_memory_usage: + self.add_memory_usage_line() @property def data(self) -> "DataFrame": """DataFrame.""" return self.info.data + @property + def dtype_counts(self) -> Mapping[str, int]: + """Mapping dtype - number of counts.""" + return self.info.dtype_counts + + @property + def non_null_counts(self) -> Sequence[int]: + return self.info.non_null_counts + + @property + def display_memory_usage(self) -> bool: + """Whether to display memory usage.""" + return self.info.memory_usage + + @property + def memory_usage_string(self) -> str: + """Memory usage string with proper size qualifier.""" + return self.info.memory_usage_string + @property def ids(self) -> Index: """Dataframe columns.""" return self.info.ids + @property + def dtypes(self) -> "Series": + """Dtypes of each of the DataFrame's columns.""" + return self.info.dtypes + @property def col_count(self) -> int: """Number of dataframe columns to be summarized.""" return self.info.col_count + def add_object_type_line(self) -> None: + """Add line with string representation of dataframe to the table.""" + self._lines.append(str(type(self.data))) + + def add_index_range_line(self) -> None: + """Add line with range of indices to the table.""" + self._lines.append(self.data.index._summary()) + + @abstractmethod + def add_columns_summary_line(self) -> None: + """Add line with columns summary to the table.""" + + @abstractmethod + def add_header_line(self) -> None: + """Add header line to the table.""" + + @abstractmethod + def add_separator_line(self) -> None: + """Add separator line between header and body of the table.""" + + @abstractmethod + def add_body_lines(self) -> None: + """Add content of the table body.""" + + def add_dtypes_line(self) -> None: + """Add summary line with dtypes present in dataframe.""" + collected_dtypes = [ + f"{key}({val:d})" for key, val in sorted(self.dtype_counts.items()) + ] + self._lines.append(f"dtypes: {', '.join(collected_dtypes)}") + def add_memory_usage_line(self) -> None: """Add line containing memory usage.""" self._lines.append(f"memory usage: {self.memory_usage_string}") class DataFrameTableBuilderNonVerbose(DataFrameTableBuilder): - """ - Dataframe info table builder for non-verbose output. - """ - - def _fill_non_empty_info(self) -> None: - """Add lines to the info table, pertaining to non-empty dataframe.""" - self.add_object_type_line() - self.add_index_range_line() - self.add_columns_summary_line() - self.add_dtypes_line() - if self.display_memory_usage: - self.add_memory_usage_line() + """Info table builder for non-verbose output.""" def add_columns_summary_line(self) -> None: self._lines.append(self.ids._summary(name="Columns")) + def add_header_line(self) -> None: + """No header in non-verbose output.""" -class TableBuilderVerboseMixin(TableBuilderAbstract): - """ - Mixin for verbose info output. 
- """ + def add_separator_line(self) -> None: + """No separator in non-verbose output.""" - SPACING: str = " " * 2 - strrows: Sequence[Sequence[str]] - gross_column_widths: Sequence[int] - with_counts: bool + def add_body_lines(self) -> None: + """No body in non-verbose output.""" + + +class DataFrameTableBuilderVerbose(DataFrameTableBuilder): + """Info table builder for verbose output.""" + + SPACING = " " * 2 + + def __init__( + self, + *, + info: DataFrameInfo, + with_counts: bool, + ): + super().__init__(info=info) + self.with_counts = with_counts + self.strrows: Sequence[Sequence[str]] = list(self._gen_rows()) + self.gross_column_widths: Sequence[int] = self._get_gross_column_widths() @property - @abstractmethod def headers(self) -> Sequence[str]: """Headers names of the columns in verbose table.""" + if self.with_counts: + return [" # ", "Column", "Non-Null Count", "Dtype"] + return [" # ", "Column", "Dtype"] + + def _gen_rows(self) -> Iterator[Sequence[str]]: + """Generator function yielding rows content. + + Each element represents a row comprising a sequence of strings. + """ + if self.with_counts: + return self._gen_rows_with_counts() + else: + return self._gen_rows_without_counts() + + def add_columns_summary_line(self) -> None: + self._lines.append(f"Data columns (total {self.col_count} columns):") @property def header_column_widths(self) -> Sequence[int]: @@ -553,25 +556,6 @@ def _get_body_column_widths(self) -> Sequence[int]: strcols: Sequence[Sequence[str]] = list(zip(*self.strrows)) return [max(len(x) for x in col) for col in strcols] - def _gen_rows(self) -> Iterator[Sequence[str]]: - """ - Generator function yielding rows content. - - Each element represents a row comprising a sequence of strings. - """ - if self.with_counts: - return self._gen_rows_with_counts() - else: - return self._gen_rows_without_counts() - - @abstractmethod - def _gen_rows_with_counts(self) -> Iterator[Sequence[str]]: - """Iterator with string representation of body data with counts.""" - - @abstractmethod - def _gen_rows_without_counts(self) -> Iterator[Sequence[str]]: - """Iterator with string representation of body data without counts.""" - def add_header_line(self) -> None: header_line = self.SPACING.join( [ @@ -602,55 +586,6 @@ def add_body_lines(self) -> None: ) self._lines.append(body_line) - def _gen_non_null_counts(self) -> Iterator[str]: - """Iterator with string representation of non-null counts.""" - for count in self.non_null_counts: - yield f"{count} non-null" - - def _gen_dtypes(self) -> Iterator[str]: - """Iterator with string representation of column dtypes.""" - for dtype in self.dtypes: - yield pprint_thing(dtype) - - -class DataFrameTableBuilderVerbose(DataFrameTableBuilder, TableBuilderVerboseMixin): - """ - Dataframe info table builder for verbose output. 
- """ - - def __init__( - self, - *, - info: DataFrameInfo, - with_counts: bool, - ): - self.info = info - self.with_counts = with_counts - self.strrows: Sequence[Sequence[str]] = list(self._gen_rows()) - self.gross_column_widths: Sequence[int] = self._get_gross_column_widths() - - def _fill_non_empty_info(self) -> None: - """Add lines to the info table, pertaining to non-empty dataframe.""" - self.add_object_type_line() - self.add_index_range_line() - self.add_columns_summary_line() - self.add_header_line() - self.add_separator_line() - self.add_body_lines() - self.add_dtypes_line() - if self.display_memory_usage: - self.add_memory_usage_line() - - @property - def headers(self) -> Sequence[str]: - """Headers names of the columns in verbose table.""" - if self.with_counts: - return [" # ", "Column", "Non-Null Count", "Dtype"] - return [" # ", "Column", "Dtype"] - - def add_columns_summary_line(self) -> None: - self._lines.append(f"Data columns (total {self.col_count} columns):") - def _gen_rows_without_counts(self) -> Iterator[Sequence[str]]: """Iterator with string representation of body data without counts.""" yield from zip( @@ -678,10 +613,12 @@ def _gen_columns(self) -> Iterator[str]: for col in self.ids: yield pprint_thing(col) + def _gen_dtypes(self) -> Iterator[str]: + """Iterator with string representation of column dtypes.""" + for dtype in self.dtypes: + yield pprint_thing(dtype) -def _get_dataframe_dtype_counts(df: "DataFrame") -> Mapping[str, int]: - """ - Create mapping between datatypes and their number of occurences. - """ - # groupby dtype.name to collect e.g. Categorical columns - return df.dtypes.value_counts().groupby(lambda x: x.name).sum() + def _gen_non_null_counts(self) -> Iterator[str]: + """Iterator with string representation of non-null counts.""" + for count in self.non_null_counts: + yield f"{count} non-null" diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index ac453839792f3..72b07000146b2 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -308,7 +308,7 @@ def format_object_summary( name : name, optional defaults to the class name of the obj indent_for_name : bool, default True - Whether subsequent lines should be indented to + Whether subsequent lines should be be indented to align with the name. line_break_each_value : bool, default False If True, inserts a line break for each value of ``obj``. diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 0eeff44d0f74c..2f3416cbf2d87 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -1,6 +1,7 @@ """ Module for applying conditional formatting to DataFrames and Series. """ + from collections import defaultdict from contextlib import contextmanager import copy @@ -32,7 +33,6 @@ import pandas as pd from pandas.api.types import is_dict_like, is_list_like -from pandas.core import generic import pandas.core.common as com from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame @@ -204,11 +204,7 @@ def _repr_html_(self) -> str: """ return self.render() - @doc( - NDFrame.to_excel, - klass="Styler", - storage_options=generic._shared_docs["storage_options"], - ) + @doc(NDFrame.to_excel, klass="Styler") def to_excel( self, excel_writer, @@ -565,6 +561,7 @@ def set_td_classes(self, classes: DataFrame) -> "Styler": ' 1' ' ' '' + """ classes = classes.reindex_like(self.data) @@ -903,7 +900,7 @@ def set_table_attributes(self, attributes: str) -> "Styler": Set the table attributes. 
These are the items that show up in the opening ``<table>`` tag
-        in addition to automatic (by default) id.
+        in addition to to automatic (by default) id.

        Parameters
        ----------
@@ -990,46 +987,20 @@ def set_caption(self, caption: str) -> "Styler":
        self.caption = caption
        return self

-    def set_table_styles(self, table_styles, axis=0, overwrite=True) -> "Styler":
+    def set_table_styles(self, table_styles) -> "Styler":
        """
        Set the table styles on a Styler.

        These are placed in a ``