From 39f1e8fd293d21212b52f2126e526ea636d30e07 Mon Sep 17 00:00:00 2001
From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com>
Date: Fri, 13 Nov 2020 12:17:52 +0800
Subject: [PATCH 01/42] ENH: Add 'end' option in resample's origin
---
pandas/tests/resample/test_resample_api.py | 14 ++++++++++++++
1 file changed, 14 insertions(+)
diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py
index 29f2aea1648ec..1bd3ecd6d2366 100644
--- a/pandas/tests/resample/test_resample_api.py
+++ b/pandas/tests/resample/test_resample_api.py
@@ -611,3 +611,17 @@ def test_resample_agg_readonly():
result = rs.agg("min")
tm.assert_series_equal(result, expected)
+
+
+def test_resample_end_origin():
+ # GH#37804
+ idx = pd.date_range('20200101 8:26:35', '20200101 9:31:58', freq='77s')
+ data = np.ones(len(idx))
+ s = pd.Series(data, index=idx)
+ result = s.resample('7min', origin='end', closed='right').sum()
+
+ exp_idx = pd.date_range('2020-01-01 08:20:45', '2020-01-01 09:23:45', freq='7T')
+ exp_data = [1., 6., 5., 6., 5., 6., 5., 6., 5., 6.]
+ expected = pd.Series(exp_data, index=exp_idx)
+
+ tm.assert_series_equal(result, expected)
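For context on the expected values in this test, a small back-of-the-envelope check (not part of the patch) of why the counts alternate between 6 and 5, with a single point in the leading partial bin:

    import pandas as pd

    # 7 min bins over 77 s samples hold either 5 or 6 points (420 = 5 * 77 + 35),
    # and the series has 51 points in total, matching 1 + 5 * 6 + 4 * 5 from the
    # expected data above.
    idx = pd.date_range("20200101 8:26:35", "20200101 9:31:58", freq="77s")
    print(len(idx))          # 51
    print(divmod(420, 77))   # (5, 35)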
From cd5aa643dc5f2aa05d17dfe0f8e08a694c0a9e4d Mon Sep 17 00:00:00 2001
From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com>
Date: Fri, 13 Nov 2020 12:20:44 +0800
Subject: [PATCH 02/42] Update resample.py
---
pandas/core/resample.py | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/pandas/core/resample.py b/pandas/core/resample.py
index fccedd75c4531..f6f1ae3524df7 100644
--- a/pandas/core/resample.py
+++ b/pandas/core/resample.py
@@ -1404,14 +1404,16 @@ def __init__(
self.fill_method = fill_method
self.limit = limit
- if origin in ("epoch", "start", "start_day"):
+ if origin in ("epoch", "start", "start_day", "end"):
+ if origin == "end" and self.closed == "left":
+ raise ValueError("'closed' has to be 'right' when 'origin' is 'end'.")
self.origin = origin
else:
try:
self.origin = Timestamp(origin)
except Exception as e:
raise ValueError(
- "'origin' should be equal to 'epoch', 'start', 'start_day' or "
+ "'origin' should be equal to 'epoch', 'start', 'start_day' 'end' or "
f"should be a Timestamp convertible type. Got '{origin}' instead."
) from e
@@ -1846,6 +1848,10 @@ def _adjust_dates_anchored(
origin_nanos = first.value
elif isinstance(origin, Timestamp):
origin_nanos = origin.value
+ elif origin == 'end':
+ sub_freq_times = (last.value - first.value) // freq.nanos
+ first = last - sub_freq_times * freq
+ origin_nanos = first.value
origin_nanos += offset.value if offset else 0
# GH 10117 & GH 19375. If first and last contain timezone information,
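As a standalone sketch of the arithmetic in this hunk (outside the patch, using only public pandas APIs), the 'end' origin steps back from the last index value by whole multiples of the frequency to find the anchored first edge:

    import pandas as pd

    # Mirror of the hunk above: anchor on the last index value and walk back in
    # whole `freq` steps; the remaining partial step becomes the leading bin.
    idx = pd.date_range("2020-01-01 08:26:35", "2020-01-01 09:31:58", freq="77s")
    first, last = idx.min(), idx.max()                          # 08:26:35 and 09:30:45
    freq = pd.Timedelta("7min")

    sub_freq_times = (last.value - first.value) // freq.value   # 9 full bins fit
    print(last - sub_freq_times * freq)                         # 2020-01-01 08:27:45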
From 0184b1deeacfc2cfd641dcc9adb5a8469200f4d5 Mon Sep 17 00:00:00 2001
From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com>
Date: Fri, 13 Nov 2020 12:24:17 +0800
Subject: [PATCH 03/42] Update resample.py
---
pandas/core/resample.py | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/pandas/core/resample.py b/pandas/core/resample.py
index f6f1ae3524df7..293fcc086952c 100644
--- a/pandas/core/resample.py
+++ b/pandas/core/resample.py
@@ -1413,8 +1413,9 @@ def __init__(
self.origin = Timestamp(origin)
except Exception as e:
raise ValueError(
- "'origin' should be equal to 'epoch', 'start', 'start_day' 'end' or "
- f"should be a Timestamp convertible type. Got '{origin}' instead."
+ "'origin' should be equal to 'epoch', 'start', 'start_day', 'end' "
+ f"or should be a Timestamp convertible type. Got '{origin}' "
+ "instead."
) from e
try:
From ff35b6f33daf9aa19c517e44a49bf462c0a6855e Mon Sep 17 00:00:00 2001
From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com>
Date: Fri, 13 Nov 2020 12:48:05 +0800
Subject: [PATCH 04/42] Update test_resample_api.py
---
pandas/tests/resample/test_resample_api.py | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py
index 1bd3ecd6d2366..623d5d496f939 100644
--- a/pandas/tests/resample/test_resample_api.py
+++ b/pandas/tests/resample/test_resample_api.py
@@ -615,13 +615,13 @@ def test_resample_agg_readonly():
def test_resample_end_origin():
# GH#37804
- idx = pd.date_range('20200101 8:26:35', '20200101 9:31:58', freq='77s')
+ idx = date_range('20200101 8:26:35', '20200101 9:31:58', freq='77s')
data = np.ones(len(idx))
- s = pd.Series(data, index=idx)
+ s = Series(data, index=idx)
result = s.resample('7min', origin='end', closed='right').sum()
- exp_idx = pd.date_range('2020-01-01 08:20:45', '2020-01-01 09:23:45', freq='7T')
+ exp_idx = date_range('2020-01-01 08:20:45', '2020-01-01 09:23:45', freq='7T')
exp_data = [1., 6., 5., 6., 5., 6., 5., 6., 5., 6.]
- expected = pd.Series(exp_data, index=exp_idx)
+ expected = Series(exp_data, index=exp_idx)
tm.assert_series_equal(result, expected)
From 8c4549edce77048bfb2c0eb1f39b39a02f30a35b Mon Sep 17 00:00:00 2001
From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com>
Date: Fri, 13 Nov 2020 12:58:58 +0800
Subject: [PATCH 05/42] Update resample.py
---
pandas/core/resample.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pandas/core/resample.py b/pandas/core/resample.py
index 293fcc086952c..2b42fefa9247c 100644
--- a/pandas/core/resample.py
+++ b/pandas/core/resample.py
@@ -1849,7 +1849,7 @@ def _adjust_dates_anchored(
origin_nanos = first.value
elif isinstance(origin, Timestamp):
origin_nanos = origin.value
- elif origin == 'end':
+ elif origin == "end":
sub_freq_times = (last.value - first.value) // freq.nanos
first = last - sub_freq_times * freq
origin_nanos = first.value
From b835d1ae79e2e8090e5d25da4db58fd4dea42a7c Mon Sep 17 00:00:00 2001
From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com>
Date: Fri, 13 Nov 2020 13:00:27 +0800
Subject: [PATCH 06/42] Update test_resample_api.py
---
pandas/tests/resample/test_resample_api.py | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py
index 623d5d496f939..a7c4cc1696808 100644
--- a/pandas/tests/resample/test_resample_api.py
+++ b/pandas/tests/resample/test_resample_api.py
@@ -615,13 +615,13 @@ def test_resample_agg_readonly():
def test_resample_end_origin():
# GH#37804
- idx = date_range('20200101 8:26:35', '20200101 9:31:58', freq='77s')
+ idx = date_range("20200101 8:26:35", "20200101 9:31:58", freq="77s")
data = np.ones(len(idx))
s = Series(data, index=idx)
- result = s.resample('7min', origin='end', closed='right').sum()
+ result = s.resample("7min", origin="end", closed="right").sum()
- exp_idx = date_range('2020-01-01 08:20:45', '2020-01-01 09:23:45', freq='7T')
- exp_data = [1., 6., 5., 6., 5., 6., 5., 6., 5., 6.]
+ exp_idx = date_range("2020-01-01 08:20:45", "2020-01-01 09:23:45", freq="7T")
+ exp_data = [1, 6, 5, 6, 5, 6, 5, 6, 5, 6]
expected = Series(exp_data, index=exp_idx)
tm.assert_series_equal(result, expected)
From bf15c67842c3b5c479ddee4f002245a77a600609 Mon Sep 17 00:00:00 2001
From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com>
Date: Fri, 13 Nov 2020 16:02:28 +0800
Subject: [PATCH 07/42] Update test_resample_api.py
---
pandas/tests/resample/test_resample_api.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py
index a7c4cc1696808..aeaf4d099eacb 100644
--- a/pandas/tests/resample/test_resample_api.py
+++ b/pandas/tests/resample/test_resample_api.py
@@ -621,7 +621,7 @@ def test_resample_end_origin():
result = s.resample("7min", origin="end", closed="right").sum()
exp_idx = date_range("2020-01-01 08:20:45", "2020-01-01 09:23:45", freq="7T")
- exp_data = [1, 6, 5, 6, 5, 6, 5, 6, 5, 6]
+ exp_data = [1.0, 6.0, 5.0, 6.0, 5.0, 6.0, 5.0, 6.0, 5.0, 6.0]
expected = Series(exp_data, index=exp_idx)
tm.assert_series_equal(result, expected)
From e4b01d87f4ad8897b3d4629f0dc4b8f28b013b79 Mon Sep 17 00:00:00 2001
From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com>
Date: Fri, 13 Nov 2020 16:03:58 +0800
Subject: [PATCH 08/42] Update test_datetime_index.py
---
pandas/tests/resample/test_datetime_index.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py
index d3d33d6fe847e..c94a687addb47 100644
--- a/pandas/tests/resample/test_datetime_index.py
+++ b/pandas/tests/resample/test_datetime_index.py
@@ -770,8 +770,8 @@ def test_resample_bad_origin(origin):
rng = date_range("2000-01-01 00:00:00", "2000-01-01 02:00", freq="s")
ts = Series(np.random.randn(len(rng)), index=rng)
msg = (
- "'origin' should be equal to 'epoch', 'start', 'start_day' or "
- f"should be a Timestamp convertible type. Got '{origin}' instead."
+ "'origin' should be equal to 'epoch', 'start', 'start_day', 'end' "
+ f"or should be a Timestamp convertible type. Got '{origin}' instead."
)
with pytest.raises(ValueError, match=msg):
ts.resample("5min", origin=origin)
From d096ccd1e9fe1137d20344bda6a3bd67197998b2 Mon Sep 17 00:00:00 2001
From: GYHHAHA <1801214626@qq.com>
Date: Fri, 27 Nov 2020 16:42:14 +0800
Subject: [PATCH 09/42] add backward para and end_day option
---
pandas/core/generic.py | 2 +
pandas/core/resample.py | 62 ++++++++++++++------
pandas/tests/resample/test_datetime_index.py | 2 +-
pandas/tests/resample/test_resample_api.py | 2 +-
4 files changed, 49 insertions(+), 19 deletions(-)
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 24c1ae971686e..a24d0c780838b 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -7952,6 +7952,7 @@ def resample(
on=None,
level=None,
origin: Union[str, TimestampConvertibleTypes] = "start_day",
+ backward: Optional[bool] = None,
offset: Optional[TimedeltaConvertibleTypes] = None,
) -> Resampler:
"""
@@ -8337,6 +8338,7 @@ def resample(
key=on,
level=level,
origin=origin,
+ backward=backward,
offset=offset,
)
diff --git a/pandas/core/resample.py b/pandas/core/resample.py
index 2b42fefa9247c..5b74782315d99 100644
--- a/pandas/core/resample.py
+++ b/pandas/core/resample.py
@@ -1366,6 +1366,7 @@ def __init__(
convention: Optional[str] = None,
base: Optional[int] = None,
origin: Union[str, TimestampConvertibleTypes] = "start_day",
+ backward: Optional[bool] = None,
offset: Optional[TimedeltaConvertibleTypes] = None,
**kwargs,
):
@@ -1389,9 +1390,15 @@ def __init__(
label = "right"
else:
if closed is None:
- closed = "left"
+ if origin in ["end", "end_day"] or backward:
+ closed = "right"
+ else:
+ closed = "left"
if label is None:
- label = "left"
+ if origin in ["end", "end_day"] or backward:
+ label = "right"
+ else:
+ label = "left"
self.closed = closed
self.label = label
@@ -1404,20 +1411,32 @@ def __init__(
self.fill_method = fill_method
self.limit = limit
- if origin in ("epoch", "start", "start_day", "end"):
- if origin == "end" and self.closed == "left":
- raise ValueError("'closed' has to be 'right' when 'origin' is 'end'.")
+ if origin in ("epoch", "start", "start_day", "end", "end_day"):
self.origin = origin
else:
try:
self.origin = Timestamp(origin)
except Exception as e:
raise ValueError(
- "'origin' should be equal to 'epoch', 'start', 'start_day', 'end' "
+ "'origin' should be equal to 'epoch', 'start', 'start_day', 'end', 'end_day' "
f"or should be a Timestamp convertible type. Got '{origin}' "
"instead."
) from e
+ if backward is None:
+ if self.origin in ("end", "end_day"):
+ self.backward = True
+ else:
+ self.backward = False
+ elif backward:
+ if origin in ("start", "start_day"):
+ raise ValueError(f"`start` or `start_day` origin isn't allowed when `backward` is True")
+ self.backward = backward
+ else:
+ if origin in ("end", "end_day"):
+ raise ValueError(f"`end` or `end_day` origin isn't allowed when `backward` is False")
+ self.backward = backward
+
try:
self.offset = Timedelta(offset) if offset is not None else None
except Exception as e:
@@ -1505,6 +1524,7 @@ def _get_time_bins(self, ax):
self.freq,
closed=self.closed,
origin=self.origin,
+ backward=self.backward,
offset=self.offset,
)
# GH #12037
@@ -1658,6 +1678,7 @@ def _get_period_bins(self, ax: PeriodIndex):
self.freq,
closed=self.closed,
origin=self.origin,
+ backward=self.backward,
offset=self.offset,
)
@@ -1711,7 +1732,7 @@ def _take_new_index(obj, indexer, new_index, axis=0):
def _get_timestamp_range_edges(
- first, last, freq, closed="left", origin="start_day", offset=None
+ first, last, freq, closed="left", origin="start_day", backward=False, offset=None
):
"""
Adjust the `first` Timestamp to the preceding Timestamp that resides on
@@ -1764,7 +1785,7 @@ def _get_timestamp_range_edges(
origin = origin.tz_localize(None)
first, last = _adjust_dates_anchored(
- first, last, freq, closed=closed, origin=origin, offset=offset
+ first, last, freq, closed=closed, origin=origin, backward=backward, offset=offset
)
if isinstance(freq, Day):
first = first.tz_localize(index_tz)
@@ -1784,7 +1805,7 @@ def _get_timestamp_range_edges(
def _get_period_range_edges(
- first, last, freq, closed="left", origin="start_day", offset=None
+ first, last, freq, closed="left", origin="start_day", backward=False, offset=None
):
"""
Adjust the provided `first` and `last` Periods to the respective Period of
@@ -1826,7 +1847,7 @@ def _get_period_range_edges(
adjust_last = freq.is_on_offset(last)
first, last = _get_timestamp_range_edges(
- first, last, freq, closed=closed, origin=origin, offset=offset
+ first, last, freq, closed=closed, origin=origin, backward=backward, offset=offset
)
first = (first + int(adjust_first) * freq).to_period(freq)
@@ -1835,7 +1856,7 @@ def _get_period_range_edges(
def _adjust_dates_anchored(
- first, last, freq, closed="right", origin="start_day", offset=None
+ first, last, freq, closed="right", origin="start_day", backward=False, offset=None
):
# First and last offsets should be calculated from the start day to fix an
# error cause by resampling across multiple days when a one day period is
@@ -1847,12 +1868,19 @@ def _adjust_dates_anchored(
origin_nanos = first.normalize().value
elif origin == "start":
origin_nanos = first.value
- elif isinstance(origin, Timestamp):
- origin_nanos = origin.value
- elif origin == "end":
- sub_freq_times = (last.value - first.value) // freq.nanos
- first = last - sub_freq_times * freq
- origin_nanos = first.value
+ elif isinstance(origin, Timestamp) or origin in ("end", "end_day"):
+ if backward:
+ if origin == "end":
+ origin = last
+ elif origin == "end_day":
+ origin = last.ceil('D')
+ sub_freq_times = (origin.value - first.value) // freq.nanos
+ if closed == "left":
+ sub_freq_times += 1
+ first = origin - sub_freq_times * freq
+ origin_nanos = first.value
+ else:
+ origin_nanos = origin.value
origin_nanos += offset.value if offset else 0
# GH 10117 & GH 19375. If first and last contain timezone information,
diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py
index c94a687addb47..156b24e0baf30 100644
--- a/pandas/tests/resample/test_datetime_index.py
+++ b/pandas/tests/resample/test_datetime_index.py
@@ -770,7 +770,7 @@ def test_resample_bad_origin(origin):
rng = date_range("2000-01-01 00:00:00", "2000-01-01 02:00", freq="s")
ts = Series(np.random.randn(len(rng)), index=rng)
msg = (
- "'origin' should be equal to 'epoch', 'start', 'start_day', 'end' "
+ "'origin' should be equal to 'epoch', 'start', 'start_day', 'end', 'end_day' "
f"or should be a Timestamp convertible type. Got '{origin}' instead."
)
with pytest.raises(ValueError, match=msg):
diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py
index aeaf4d099eacb..54f4c8189037e 100644
--- a/pandas/tests/resample/test_resample_api.py
+++ b/pandas/tests/resample/test_resample_api.py
@@ -620,7 +620,7 @@ def test_resample_end_origin():
s = Series(data, index=idx)
result = s.resample("7min", origin="end", closed="right").sum()
- exp_idx = date_range("2020-01-01 08:20:45", "2020-01-01 09:23:45", freq="7T")
+ exp_idx = date_range("2020-01-01 08:27:45", "2020-01-01 09:30:45", freq="7T")
exp_data = [1.0, 6.0, 5.0, 6.0, 5.0, 6.0, 5.0, 6.0, 5.0, 6.0]
expected = Series(exp_data, index=exp_idx)
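A hedged sketch of the backward anchoring introduced here with a user-supplied Timestamp origin; the data and origin values are taken from the tests added later in this series, and the closed == "left" adjustment mirrors the hunk above:

    import numpy as np
    import pandas as pd

    rng = pd.date_range("2000-10-01 23:30:00", "2000-10-02 00:26:00", freq="7min")
    ts = pd.Series(np.arange(len(rng)) * 3, index=rng)

    origin = pd.Timestamp("2000-10-02 00:40:00")
    freq = pd.Timedelta("17min")
    first, closed = ts.index.min(), "right"       # backward resampling defaults to closed="right"

    sub_freq_times = (origin.value - first.value) // freq.value
    if closed == "left":
        sub_freq_times += 1                       # one more step back, as in the hunk
    print(origin - sub_freq_times * freq)         # 2000-10-01 23:32:00, within one freq of `first`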
From 222ef8dd9ece689909ce0b5c3dfda426ff9f0959 Mon Sep 17 00:00:00 2001
From: GYHHAHA <1801214626@qq.com>
Date: Fri, 27 Nov 2020 17:42:29 +0800
Subject: [PATCH 10/42] add doc-string
---
pandas/core/generic.py | 44 +++++++++++++++++++++++++++++++---
pandas/core/groupby/grouper.py | 30 ++++++++++++++++++++---
pandas/core/resample.py | 26 ++++++++++++++++----
3 files changed, 89 insertions(+), 11 deletions(-)
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index a24d0c780838b..1f2d2d3228774 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -7973,8 +7973,9 @@ def resample(
`DatetimeIndex`, `TimedeltaIndex` or `PeriodIndex`.
closed : {'right', 'left'}, default None
Which side of bin interval is closed. The default is 'left'
- for all frequency offsets except for 'M', 'A', 'Q', 'BM',
- 'BA', 'BQ', and 'W' which all have a default of 'right'.
+ for all frequency offsets with forward resampling except for 'M',
+ 'A', 'Q', 'BM', 'BA', 'BQ', and 'W' which all have a default of
+ 'right'. When `backward` is set to True, the default is 'right'.
label : {'right', 'left'}, default None
Which bin edge label to label bucket with. The default is 'left'
for all frequency offsets except for 'M', 'A', 'Q', 'BM',
@@ -8007,7 +8008,7 @@ def resample(
level : str or int, optional
For a MultiIndex, level (name or number) to use for
resampling. `level` must be datetime-like.
- origin : {'epoch', 'start', 'start_day'}, Timestamp or str, default 'start_day'
+ origin : {'epoch', 'start', 'start_day', 'end', 'end_day'}, Timestamp or str, default 'start_day'
The timestamp on which to adjust the grouping. The timezone of origin
must match the timezone of the index.
If a timestamp is not used, these values are also supported:
@@ -8018,6 +8019,21 @@ def resample(
.. versionadded:: 1.1.0
+ - 'end': `origin` is the last value of the timeseries
+ - 'end_day': `origin` is the ceiling midnight of the last day
+
+ .. versionadded:: 1.2.0
+
+ backward : bool, default is None
+ Resample from the given `origin` in the backward direction. Defaults to
+ True when `origin` is 'end' or 'end_day', and to False when `origin` is
+ 'start' or 'start_day'. Optional when using a datetime `origin`, in which
+ case it defaults to False. For a backward resample, the result labeled
+ with a given datetime covers the group from that datetime minus the given
+ `freq` up to that datetime, with a right `closed` setting by default.
+
+ .. versionadded:: 1.2.0
+
offset : Timedelta or str, default is None
An offset timedelta added to the origin.
@@ -8297,6 +8313,28 @@ def resample(
2000-10-02 00:21:00 24
Freq: 17T, dtype: int64
+ If you want to take the last timestamp as `origin` with a backward resample:
+
+ >>> ts.index.max()
+ Timestamp('2000-10-02 00:26:00', freq='7T')
+ >>> ts.groupby(pd.Grouper(freq='17min', origin='end')).sum()
+ 2000-10-01 23:35:00 0
+ 2000-10-01 23:52:00 18
+ 2000-10-02 00:09:00 27
+ 2000-10-02 00:26:00 63
+ Freq: 17T, dtype: int32
+
+ You can also specify the backward origin:
+
+ >>> ts.groupby(pd.Grouper(freq='17min',
+ origin='2000-10-02 00:30:00',
+ backward=True)).sum()
+ 2000-10-01 23:39:00 3
+ 2000-10-01 23:56:00 15
+ 2000-10-02 00:13:00 45
+ 2000-10-02 00:30:00 45
+ Freq: 17T, dtype: int32
+
To replace the use of the deprecated `base` argument, you can now use `offset`,
in this example it is equivalent to have `base=2`:
diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
index e8af9da30a298..7889c50ca3df7 100644
--- a/pandas/core/groupby/grouper.py
+++ b/pandas/core/groupby/grouper.py
@@ -83,9 +83,9 @@ class Grouper:
However, loffset is also deprecated for ``.resample(...)``
See: :class:`DataFrame.resample`
- origin : {'epoch', 'start', 'start_day'}, Timestamp or str, default 'start_day'
- The timestamp on which to adjust the grouping. The timezone of origin must
- match the timezone of the index.
+ origin : {'epoch', 'start', 'start_day', 'end', 'end_day'}, Timestamp or str, default 'start_day'
+ The timestamp on which to adjust the grouping. The timezone of origin
+ must match the timezone of the index.
If a timestamp is not used, these values are also supported:
- 'epoch': `origin` is 1970-01-01
@@ -94,6 +94,21 @@ class Grouper:
.. versionadded:: 1.1.0
+ - 'end': `origin` is the last value of the timeseries
+ - 'end_day': `origin` is the ceiling midnight of the last day
+
+ .. versionadded:: 1.2.0
+
+ backward : bool, default is None
+ Resample from the given `origin` in the backward direction. Defaults to
+ True when `origin` is 'end' or 'end_day', and to False when `origin` is
+ 'start' or 'start_day'. Optional when using a datetime `origin`, in which
+ case it defaults to False. For a backward resample, the result labeled
+ with a given datetime covers the group from that datetime minus the given
+ `freq` up to that datetime, with a right `closed` setting by default.
+
+ .. versionadded:: 1.2.0
+
offset : Timedelta or str, default is None
An offset timedelta added to the origin.
@@ -200,6 +215,15 @@ class Grouper:
2000-10-02 00:15:00 45
Freq: 17T, dtype: int64
+ If you want to take the last timestamp as `origin` with a backward resample:
+
+ >>> ts.groupby(pd.Grouper(freq='17min', origin='end')).sum()
+ 2000-10-01 23:39:00 0
+ 2000-10-01 23:56:00 0
+ 2000-10-02 00:13:00 3
+ 2000-10-02 00:30:00 6
+ Freq: 17T, dtype: int32
+
If you want to adjust the start of the bins with an `offset` Timedelta, the two
following lines are equivalent:
diff --git a/pandas/core/resample.py b/pandas/core/resample.py
index 5b74782315d99..3d7389b724819 100644
--- a/pandas/core/resample.py
+++ b/pandas/core/resample.py
@@ -1430,11 +1430,15 @@ def __init__(
self.backward = False
elif backward:
if origin in ("start", "start_day"):
- raise ValueError(f"`start` or `start_day` origin isn't allowed when `backward` is True")
+ raise ValueError(
+ f"`start` or `start_day` origin isn't allowed when `backward` is True"
+ )
self.backward = backward
else:
if origin in ("end", "end_day"):
- raise ValueError(f"`end` or `end_day` origin isn't allowed when `backward` is False")
+ raise ValueError(
+ f"`end` or `end_day` origin isn't allowed when `backward` is False"
+ )
self.backward = backward
try:
@@ -1785,7 +1789,13 @@ def _get_timestamp_range_edges(
origin = origin.tz_localize(None)
first, last = _adjust_dates_anchored(
- first, last, freq, closed=closed, origin=origin, backward=backward, offset=offset
+ first,
+ last,
+ freq,
+ closed=closed,
+ origin=origin,
+ backward=backward,
+ offset=offset,
)
if isinstance(freq, Day):
first = first.tz_localize(index_tz)
@@ -1847,7 +1857,13 @@ def _get_period_range_edges(
adjust_last = freq.is_on_offset(last)
first, last = _get_timestamp_range_edges(
- first, last, freq, closed=closed, origin=origin, backward=backward, offset=offset
+ first,
+ last,
+ freq,
+ closed=closed,
+ origin=origin,
+ backward=backward,
+ offset=offset,
)
first = (first + int(adjust_first) * freq).to_period(freq)
@@ -1873,7 +1889,7 @@ def _adjust_dates_anchored(
if origin == "end":
origin = last
elif origin == "end_day":
- origin = last.ceil('D')
+ origin = last.ceil("D")
sub_freq_times = (origin.value - first.value) // freq.nanos
if closed == "left":
sub_freq_times += 1
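To illustrate the intent of the new docstring, a hedged sketch of how ``origin='end_day'`` is meant to relate to an explicit ceiled-midnight origin with ``backward=True`` (this assumes a pandas build with the patch series applied; the equivalence is the one documented later in this series):

    import numpy as np
    import pandas as pd

    rng = pd.date_range("2000-10-01 23:30:00", "2000-10-02 00:26:00", freq="7min")
    ts = pd.Series(np.arange(len(rng)) * 3, index=rng)

    # 'end_day' anchors on the ceiled midnight of the last timestamp, so the two
    # calls below are intended to produce the same bins.
    end_day_origin = ts.index.max().ceil("D")     # 2000-10-03 00:00:00
    res_a = ts.resample("17min", origin="end_day").sum()
    res_b = ts.resample("17min", origin=end_day_origin, backward=True).sum()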
From 90c9c5f073d71341c93ba6994f390b53831aa1bd Mon Sep 17 00:00:00 2001
From: GYHHAHA <1801214626@qq.com>
Date: Fri, 27 Nov 2020 18:40:44 +0800
Subject: [PATCH 11/42] add test cases
---
pandas/core/resample.py | 2 +-
pandas/tests/resample/test_resample_api.py | 115 ++++++++++++++++++++-
2 files changed, 115 insertions(+), 2 deletions(-)
diff --git a/pandas/core/resample.py b/pandas/core/resample.py
index 3d7389b724819..cb0d91e0e5398 100644
--- a/pandas/core/resample.py
+++ b/pandas/core/resample.py
@@ -1438,7 +1438,7 @@ def __init__(
if origin in ("end", "end_day"):
raise ValueError(
f"`end` or `end_day` origin isn't allowed when `backward` is False"
- )
+ )
self.backward = backward
try:
diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py
index 54f4c8189037e..63b678b04a654 100644
--- a/pandas/tests/resample/test_resample_api.py
+++ b/pandas/tests/resample/test_resample_api.py
@@ -613,8 +613,121 @@ def test_resample_agg_readonly():
tm.assert_series_equal(result, expected)
-def test_resample_end_origin():
+def test_backward_resample():
# GH#37804
+
+ start, end = "2000-10-01 23:30:00", "2000-10-02 00:26:00"
+ rng = date_range(start, end, freq="7min")
+ ts = Series(np.arange(len(rng)) * 3, index=rng)
+
+ # test consistency of backward and origin
+ msg = "`start` or `start_day` origin isn't allowed when `backward` is True"
+ with pytest.raises(ValueError, match=msg):
+ ts.resample("1min", origin="start", backward=True)
+ msg = "`end` or `end_day` origin isn't allowed when `backward` is False"
+ with pytest.raises(ValueError, match=msg):
+ ts.resample("1min", origin="end", backward=False)
+
+ # test end origin
+ res = ts.resample("17min", origin="end").sum().astype("int64")
+ data = [0, 18, 27, 63]
+ expected = Series(
+ data,
+ index=date_range(
+ end="20001002 00:26:00",
+ freq="17min",
+ periods=4,
+ )
+ )
+
+ tm.assert_series_equal(res, expected)
+
+ # test end_day origin
+ # 12 == 24 * 60 - 84 * 17 <= 26 (last value) <= 24 * 60 - 83 * 17 == 29
+ res = ts.resample("17min", origin="end_day").sum().astype("int64")
+ data = [3, 15, 45, 45]
+ expected = Series(
+ data,
+ index=date_range(
+ end="2000-10-02 00:29:00",
+ freq="17min",
+ periods=4,
+ )
+ )
+
+ tm.assert_series_equal(res, expected)
+
+ # test datetime origin with backward resample
+ res = ts.resample(
+ "17min",
+ origin="2000-10-02 00:40:00",
+ backward=True,
+ ).sum().astype("int64")
+ data = [0, 9, 36, 39, 24]
+ expected = Series(
+ data,
+ index=date_range(
+ end="2000-10-02 00:40:00",
+ freq="17min",
+ periods=5,
+ )
+ )
+
+ tm.assert_series_equal(res, expected)
+
+ res = ts.resample(
+ "17min",
+ origin="2000-10-02 01:05:00",
+ backward=True,
+ ).sum().astype("int64")
+ data = [3, 15, 45, 45]
+ expected = Series(
+ data,
+ index=date_range(
+ end="2000-10-02 00:31:00",
+ freq="17min",
+ periods=4,
+ )
+ )
+
+ tm.assert_series_equal(res, expected)
+
+ # test right and left close
+ res = ts.resample(
+ "17min",
+ origin="end",
+ closed="right",
+ ).sum().astype("int64")
+ data = [0, 18, 27, 63]
+ expected = Series(
+ data,
+ index=date_range(
+ end="2000-10-02 00:26:00 ",
+ freq="17min",
+ periods=4,
+ )
+ )
+
+ tm.assert_series_equal(res, expected)
+
+ res = ts.resample(
+ "17min",
+ origin="end",
+ closed="left",
+ ).sum().astype("int64")
+ data = [0, 18, 27, 39, 24]
+ expected = Series(
+ data,
+ index=date_range(
+ end="2000-10-02 00:43:00",
+ freq="17min",
+ periods=5,
+ )
+ )
+
+ tm.assert_series_equal(res, expected)
+
+ # original test case
idx = date_range("20200101 8:26:35", "20200101 9:31:58", freq="77s")
data = np.ones(len(idx))
s = Series(data, index=idx)
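A quick, hedged check of the arithmetic quoted in the ``end_day`` comment above, using plain Timestamp math that is independent of the patch:

    import pandas as pd

    last = pd.Timestamp("2000-10-02 00:26:00")    # last value of the test series
    origin = last.ceil("D")                       # 2000-10-03 00:00:00
    freq = pd.Timedelta("17min")

    # 83 whole 17-minute bins fit between `last` and the ceiled midnight, so the
    # final right-edge label is 24 * 60 - 83 * 17 = 29 minutes past midnight.
    steps_back = (origin - last) // freq          # 83
    print(origin - steps_back * freq)             # 2000-10-02 00:29:00
    print(24 * 60 - 84 * 17, 24 * 60 - 83 * 17)   # 12 29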
From eae898ccf44068bc163769b54f31608f4351a91b Mon Sep 17 00:00:00 2001
From: GYHHAHA <1801214626@qq.com>
Date: Fri, 27 Nov 2020 18:57:22 +0800
Subject: [PATCH 12/42] fix format
---
pandas/core/generic.py | 3 +-
pandas/core/groupby/grouper.py | 3 +-
pandas/core/resample.py | 8 ++++--
pandas/tests/resample/test_resample_api.py | 32 +++++++++++-----------
4 files changed, 25 insertions(+), 21 deletions(-)
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 1f2d2d3228774..98fef283eb8d9 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -8008,7 +8008,8 @@ def resample(
level : str or int, optional
For a MultiIndex, level (name or number) to use for
resampling. `level` must be datetime-like.
- origin : {'epoch', 'start', 'start_day', 'end', 'end_day'}, Timestamp or str, default 'start_day'
+ origin : {'epoch', 'start', 'start_day', 'end', 'end_day'}, Timestamp
+ or str, default 'start_day'
The timestamp on which to adjust the grouping. The timezone of origin
must match the timezone of the index.
If a timestamp is not used, these values are also supported:
diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
index 7889c50ca3df7..b040fe993100b 100644
--- a/pandas/core/groupby/grouper.py
+++ b/pandas/core/groupby/grouper.py
@@ -83,7 +83,8 @@ class Grouper:
However, loffset is also deprecated for ``.resample(...)``
See: :class:`DataFrame.resample`
- origin : {'epoch', 'start', 'start_day', 'end', 'end_day'}, Timestamp or str, default 'start_day'
+ origin : {'epoch', 'start', 'start_day', 'end', 'end_day'}, Timestamp
+ or str, default 'start_day'
The timestamp on which to adjust the grouping. The timezone of origin
must match the timezone of the index.
If a timestamp is not used, these values are also supported:
diff --git a/pandas/core/resample.py b/pandas/core/resample.py
index cb0d91e0e5398..58cafb5d151de 100644
--- a/pandas/core/resample.py
+++ b/pandas/core/resample.py
@@ -1418,8 +1418,9 @@ def __init__(
self.origin = Timestamp(origin)
except Exception as e:
raise ValueError(
- "'origin' should be equal to 'epoch', 'start', 'start_day', 'end', 'end_day' "
- f"or should be a Timestamp convertible type. Got '{origin}' "
+ "'origin' should be equal to 'epoch', 'start', 'start_day',"
+ " 'end', 'end_day' or should be a Timestamp convertible"
+ f" type. Got '{origin}' "
"instead."
) from e
@@ -1431,7 +1432,8 @@ def __init__(
elif backward:
if origin in ("start", "start_day"):
raise ValueError(
- f"`start` or `start_day` origin isn't allowed when `backward` is True"
+ f"`start` or `start_day` origin isn't allowed when "
+ "`backward` is True"
)
self.backward = backward
else:
diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py
index 63b678b04a654..0cf081c3adb96 100644
--- a/pandas/tests/resample/test_resample_api.py
+++ b/pandas/tests/resample/test_resample_api.py
@@ -659,10 +659,10 @@ def test_backward_resample():
# test datetime origin with backward resample
res = ts.resample(
- "17min",
- origin="2000-10-02 00:40:00",
- backward=True,
- ).sum().astype("int64")
+ "17min",
+ origin="2000-10-02 00:40:00",
+ backward=True,
+ ).sum().astype("int64")
data = [0, 9, 36, 39, 24]
expected = Series(
data,
@@ -676,10 +676,10 @@ def test_backward_resample():
tm.assert_series_equal(res, expected)
res = ts.resample(
- "17min",
- origin="2000-10-02 01:05:00",
- backward=True,
- ).sum().astype("int64")
+ "17min",
+ origin="2000-10-02 01:05:00",
+ backward=True,
+ ).sum().astype("int64")
data = [3, 15, 45, 45]
expected = Series(
data,
@@ -694,10 +694,10 @@ def test_backward_resample():
# test right and left close
res = ts.resample(
- "17min",
- origin="end",
- closed="right",
- ).sum().astype("int64")
+ "17min",
+ origin="end",
+ closed="right",
+ ).sum().astype("int64")
data = [0, 18, 27, 63]
expected = Series(
data,
@@ -711,10 +711,10 @@ def test_backward_resample():
tm.assert_series_equal(res, expected)
res = ts.resample(
- "17min",
- origin="end",
- closed="left",
- ).sum().astype("int64")
+ "17min",
+ origin="end",
+ closed="left",
+ ).sum().astype("int64")
data = [0, 18, 27, 39, 24]
expected = Series(
data,
From 2ee1000d7144481354b60bb1b70470424b2936f0 Mon Sep 17 00:00:00 2001
From: GYHHAHA <1801214626@qq.com>
Date: Fri, 27 Nov 2020 18:59:13 +0800
Subject: [PATCH 13/42] Update test_resample_api.py
---
pandas/tests/resample/test_resample_api.py | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py
index 0cf081c3adb96..6d2bfaf0ac181 100644
--- a/pandas/tests/resample/test_resample_api.py
+++ b/pandas/tests/resample/test_resample_api.py
@@ -637,7 +637,7 @@ def test_backward_resample():
end="20001002 00:26:00",
freq="17min",
periods=4,
- )
+ ),
)
tm.assert_series_equal(res, expected)
@@ -652,7 +652,7 @@ def test_backward_resample():
end="2000-10-02 00:29:00",
freq="17min",
periods=4,
- )
+ ),
)
tm.assert_series_equal(res, expected)
@@ -670,7 +670,7 @@ def test_backward_resample():
end="2000-10-02 00:40:00",
freq="17min",
periods=5,
- )
+ ),
)
tm.assert_series_equal(res, expected)
@@ -687,7 +687,7 @@ def test_backward_resample():
end="2000-10-02 00:31:00",
freq="17min",
periods=4,
- )
+ ),
)
tm.assert_series_equal(res, expected)
@@ -705,7 +705,7 @@ def test_backward_resample():
end="2000-10-02 00:26:00 ",
freq="17min",
periods=4,
- )
+ ),
)
tm.assert_series_equal(res, expected)
@@ -722,7 +722,7 @@ def test_backward_resample():
end="2000-10-02 00:43:00",
freq="17min",
periods=5,
- )
+ ),
)
tm.assert_series_equal(res, expected)
From 3442e007af697442d333c29b1c9bc5b4a54fd957 Mon Sep 17 00:00:00 2001
From: GYHHAHA <1801214626@qq.com>
Date: Fri, 27 Nov 2020 19:08:46 +0800
Subject: [PATCH 14/42] Update test_resample_api.py
---
pandas/tests/resample/test_resample_api.py | 14 +++++++++-----
1 file changed, 9 insertions(+), 5 deletions(-)
diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py
index 6d2bfaf0ac181..17229684d015a 100644
--- a/pandas/tests/resample/test_resample_api.py
+++ b/pandas/tests/resample/test_resample_api.py
@@ -710,11 +710,15 @@ def test_backward_resample():
tm.assert_series_equal(res, expected)
- res = ts.resample(
- "17min",
- origin="end",
- closed="left",
- ).sum().astype("int64")
+ res = (
+ ts.resample(
+ "17min",
+ origin="end",
+ closed="left",
+ )
+ .sum()
+ .astype("int64")
+ )
data = [0, 18, 27, 39, 24]
expected = Series(
data,
From a33acacf24afd1d8c3e02a3a45d55f5a63b61e92 Mon Sep 17 00:00:00 2001
From: GYHHAHA <1801214626@qq.com>
Date: Fri, 27 Nov 2020 19:17:00 +0800
Subject: [PATCH 15/42] Update test_resample_api.py
---
pandas/tests/resample/test_resample_api.py | 14 +++++++++-----
1 file changed, 9 insertions(+), 5 deletions(-)
diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py
index 17229684d015a..f6e8c56b5f332 100644
--- a/pandas/tests/resample/test_resample_api.py
+++ b/pandas/tests/resample/test_resample_api.py
@@ -693,11 +693,15 @@ def test_backward_resample():
tm.assert_series_equal(res, expected)
# test right and left close
- res = ts.resample(
- "17min",
- origin="end",
- closed="right",
- ).sum().astype("int64")
+ res = (
+ ts.resample(
+ "17min",
+ origin="end",
+ closed="right",
+ )
+ .sum()
+ .astype("int64")
+ )
data = [0, 18, 27, 63]
expected = Series(
data,
From 7c5483994be5543c9bff2615dfeb252851dd3548 Mon Sep 17 00:00:00 2001
From: GYHHAHA <1801214626@qq.com>
Date: Fri, 27 Nov 2020 19:33:22 +0800
Subject: [PATCH 16/42] Update test_resample_api.py
---
pandas/tests/resample/test_resample_api.py | 28 ++++++++++++++--------
1 file changed, 18 insertions(+), 10 deletions(-)
diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py
index f6e8c56b5f332..6b000cae7c1dc 100644
--- a/pandas/tests/resample/test_resample_api.py
+++ b/pandas/tests/resample/test_resample_api.py
@@ -658,11 +658,15 @@ def test_backward_resample():
tm.assert_series_equal(res, expected)
# test datetime origin with backward resample
- res = ts.resample(
- "17min",
- origin="2000-10-02 00:40:00",
- backward=True,
- ).sum().astype("int64")
+ res = (
+ ts.resample(
+ "17min",
+ origin="2000-10-02 00:40:00",
+ backward=True,
+ )
+ .sum()
+ .astype("int64")
+ )
data = [0, 9, 36, 39, 24]
expected = Series(
data,
@@ -675,11 +679,15 @@ def test_backward_resample():
tm.assert_series_equal(res, expected)
- res = ts.resample(
- "17min",
- origin="2000-10-02 01:05:00",
- backward=True,
- ).sum().astype("int64")
+ res = (
+ ts.resample(
+ "17min",
+ origin="2000-10-02 01:05:00",
+ backward=True,
+ )
+ .sum()
+ .astype("int64")
+ )
data = [3, 15, 45, 45]
expected = Series(
data,
From a4e0a3919e42d14fb3c9961f4dc86d9e02e8f664 Mon Sep 17 00:00:00 2001
From: GYHHAHA <1801214626@qq.com>
Date: Fri, 27 Nov 2020 21:39:11 +0800
Subject: [PATCH 17/42] flake8 fix
---
pandas/core/resample.py | 11 +++++------
1 file changed, 5 insertions(+), 6 deletions(-)
diff --git a/pandas/core/resample.py b/pandas/core/resample.py
index 58cafb5d151de..53e565a966769 100644
--- a/pandas/core/resample.py
+++ b/pandas/core/resample.py
@@ -1418,10 +1418,9 @@ def __init__(
self.origin = Timestamp(origin)
except Exception as e:
raise ValueError(
- "'origin' should be equal to 'epoch', 'start', 'start_day',"
- " 'end', 'end_day' or should be a Timestamp convertible"
- f" type. Got '{origin}' "
- "instead."
+ "'origin' should be equal to 'epoch', 'start', 'start_day', "
+ "'end', 'end_day' or should be a Timestamp convertible "
+ f"type. Got '{origin}' instead."
) from e
if backward is None:
@@ -1432,14 +1431,14 @@ def __init__(
elif backward:
if origin in ("start", "start_day"):
raise ValueError(
- f"`start` or `start_day` origin isn't allowed when "
+ "`start` or `start_day` origin isn't allowed when "
"`backward` is True"
)
self.backward = backward
else:
if origin in ("end", "end_day"):
raise ValueError(
- f"`end` or `end_day` origin isn't allowed when `backward` is False"
+ "`end` or `end_day` origin isn't allowed when `backward` is False"
)
self.backward = backward
From 0e2e390a020f872d359689dba67ab91f5f4cb660 Mon Sep 17 00:00:00 2001
From: GYHHAHA <1801214626@qq.com>
Date: Fri, 27 Nov 2020 22:50:08 +0800
Subject: [PATCH 18/42] break lines
---
pandas/core/generic.py | 4 ++--
pandas/core/groupby/grouper.py | 2 +-
pandas/core/resample.py | 2 +-
3 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 98fef283eb8d9..82e948bb39bdb 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -7952,7 +7952,7 @@ def resample(
on=None,
level=None,
origin: Union[str, TimestampConvertibleTypes] = "start_day",
- backward: Optional[bool] = None,
+ backward: Optional[bool_t] = None,
offset: Optional[TimedeltaConvertibleTypes] = None,
) -> Resampler:
"""
@@ -8008,7 +8008,7 @@ def resample(
level : str or int, optional
For a MultiIndex, level (name or number) to use for
resampling. `level` must be datetime-like.
- origin : {'epoch', 'start', 'start_day', 'end', 'end_day'}, Timestamp
+ origin : {'epoch', 'start', 'start_day', 'end', 'end_day'}, Timestamp \
or str, default 'start_day'
The timestamp on which to adjust the grouping. The timezone of origin
must match the timezone of the index.
diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
index b040fe993100b..5dbd8d92e7cbb 100644
--- a/pandas/core/groupby/grouper.py
+++ b/pandas/core/groupby/grouper.py
@@ -83,7 +83,7 @@ class Grouper:
However, loffset is also deprecated for ``.resample(...)``
See: :class:`DataFrame.resample`
- origin : {'epoch', 'start', 'start_day', 'end', 'end_day'}, Timestamp
+ origin : {'epoch', 'start', 'start_day', 'end', 'end_day'}, Timestamp \
or str, default 'start_day'
The timestamp on which to adjust the grouping. The timezone of origin
must match the timezone of the index.
diff --git a/pandas/core/resample.py b/pandas/core/resample.py
index 53e565a966769..8b391d087ce6f 100644
--- a/pandas/core/resample.py
+++ b/pandas/core/resample.py
@@ -1366,7 +1366,7 @@ def __init__(
convention: Optional[str] = None,
base: Optional[int] = None,
origin: Union[str, TimestampConvertibleTypes] = "start_day",
- backward: Optional[bool] = None,
+ backward: Optional[bool_t] = None,
offset: Optional[TimedeltaConvertibleTypes] = None,
**kwargs,
):
From 9f4844a629622ca91669112361dedb0e63165285 Mon Sep 17 00:00:00 2001
From: GYHHAHA <1801214626@qq.com>
Date: Fri, 27 Nov 2020 23:34:35 +0800
Subject: [PATCH 19/42] Update resample.py
---
pandas/core/resample.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pandas/core/resample.py b/pandas/core/resample.py
index 8b391d087ce6f..53e565a966769 100644
--- a/pandas/core/resample.py
+++ b/pandas/core/resample.py
@@ -1366,7 +1366,7 @@ def __init__(
convention: Optional[str] = None,
base: Optional[int] = None,
origin: Union[str, TimestampConvertibleTypes] = "start_day",
- backward: Optional[bool_t] = None,
+ backward: Optional[bool] = None,
offset: Optional[TimedeltaConvertibleTypes] = None,
**kwargs,
):
From 5b7f396d53ce8c73300d705da54170a4f321825a Mon Sep 17 00:00:00 2001
From: GYHHAHA <1801214626@qq.com>
Date: Sat, 28 Nov 2020 00:04:13 +0800
Subject: [PATCH 20/42] fix docstring
---
pandas/core/generic.py | 2 +-
pandas/core/groupby/grouper.py | 10 +++++-----
2 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 82e948bb39bdb..a063648c3c215 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -8323,7 +8323,7 @@ def resample(
2000-10-01 23:52:00 18
2000-10-02 00:09:00 27
2000-10-02 00:26:00 63
- Freq: 17T, dtype: int32
+ Freq: 17T, dtype: int64
You can also specify the backward origin:
diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
index 5dbd8d92e7cbb..261190747ee61 100644
--- a/pandas/core/groupby/grouper.py
+++ b/pandas/core/groupby/grouper.py
@@ -219,11 +219,11 @@ class Grouper:
If you want to take the last timestamp as `origin` with a backward resample:
>>> ts.groupby(pd.Grouper(freq='17min', origin='end')).sum()
- 2000-10-01 23:39:00 0
- 2000-10-01 23:56:00 0
- 2000-10-02 00:13:00 3
- 2000-10-02 00:30:00 6
- Freq: 17T, dtype: int32
+ 2000-10-01 23:35:00 0
+ 2000-10-01 23:52:00 18
+ 2000-10-02 00:09:00 27
+ 2000-10-02 00:26:00 63
+ Freq: 17T, dtype: int64
If you want to adjust the start of the bins with an `offset` Timedelta, the two
following lines are equivalent:
From 115c92a3627b2ba002623eb6b10224725bfa94fb Mon Sep 17 00:00:00 2001
From: GYHHAHA <1801214626@qq.com>
Date: Sat, 28 Nov 2020 09:14:25 +0800
Subject: [PATCH 21/42] split tests
---
pandas/core/generic.py | 4 +-
pandas/tests/resample/test_resample_api.py | 52 +++++++++++++---------
2 files changed, 32 insertions(+), 24 deletions(-)
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index a063648c3c215..4b1865b2a34d8 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -8328,8 +8328,8 @@ def resample(
You can also specify the backward origin:
>>> ts.groupby(pd.Grouper(freq='17min',
- origin='2000-10-02 00:30:00',
- backward=True)).sum()
+ ... origin='2000-10-02 00:30:00',
+ ... backward=True)).sum()
2000-10-01 23:39:00 3
2000-10-01 23:56:00 15
2000-10-02 00:13:00 45
diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py
index 6b000cae7c1dc..54f2beab25d8b 100644
--- a/pandas/tests/resample/test_resample_api.py
+++ b/pandas/tests/resample/test_resample_api.py
@@ -613,14 +613,14 @@ def test_resample_agg_readonly():
tm.assert_series_equal(result, expected)
-def test_backward_resample():
- # GH#37804
+# test data for backward resample GH#37804
+start, end = "2000-10-01 23:30:00", "2000-10-02 00:26:00"
+rng = date_range(start, end, freq="7min")
+ts = Series(np.arange(len(rng)) * 3, index=rng)
- start, end = "2000-10-01 23:30:00", "2000-10-02 00:26:00"
- rng = date_range(start, end, freq="7min")
- ts = Series(np.arange(len(rng)) * 3, index=rng)
- # test consistency of backward and origin
+def test_backward_origin_consistency():
+
msg = "`start` or `start_day` origin isn't allowed when `backward` is True"
with pytest.raises(ValueError, match=msg):
ts.resample("1min", origin="start", backward=True)
@@ -628,7 +628,9 @@ def test_backward_resample():
with pytest.raises(ValueError, match=msg):
ts.resample("1min", origin="end", backward=False)
- # test end origin
+
+def test_end_origin():
+
res = ts.resample("17min", origin="end").sum().astype("int64")
data = [0, 18, 27, 63]
expected = Series(
@@ -642,7 +644,21 @@ def test_backward_resample():
tm.assert_series_equal(res, expected)
- # test end_day origin
+ # an extra test case
+ idx = date_range("20200101 8:26:35", "20200101 9:31:58", freq="77s")
+ data = np.ones(len(idx))
+ s = Series(data, index=idx)
+ result = s.resample("7min", origin="end", closed="right").sum()
+
+ exp_idx = date_range("2020-01-01 08:27:45", "2020-01-01 09:30:45", freq="7T")
+ exp_data = [1.0, 6.0, 5.0, 6.0, 5.0, 6.0, 5.0, 6.0, 5.0, 6.0]
+ expected = Series(exp_data, index=exp_idx)
+
+ tm.assert_series_equal(result, expected)
+
+
+def test_end_day_origin():
+
# 12 == 24 * 60 - 84 * 17 <= 26 (last value) <= 24 * 60 - 83 * 17 == 29
res = ts.resample("17min", origin="end_day").sum().astype("int64")
data = [3, 15, 45, 45]
@@ -657,7 +673,9 @@ def test_backward_resample():
tm.assert_series_equal(res, expected)
- # test datetime origin with backward resample
+
+def test_backward_resample_with_datetime_origin():
+
res = (
ts.resample(
"17min",
@@ -700,7 +718,9 @@ def test_backward_resample():
tm.assert_series_equal(res, expected)
- # test right and left close
+
+def test_left_and_right_close_in_backward_resample():
+
res = (
ts.resample(
"17min",
@@ -742,15 +762,3 @@ def test_backward_resample():
)
tm.assert_series_equal(res, expected)
-
- # original test case
- idx = date_range("20200101 8:26:35", "20200101 9:31:58", freq="77s")
- data = np.ones(len(idx))
- s = Series(data, index=idx)
- result = s.resample("7min", origin="end", closed="right").sum()
-
- exp_idx = date_range("2020-01-01 08:27:45", "2020-01-01 09:30:45", freq="7T")
- exp_data = [1.0, 6.0, 5.0, 6.0, 5.0, 6.0, 5.0, 6.0, 5.0, 6.0]
- expected = Series(exp_data, index=exp_idx)
-
- tm.assert_series_equal(result, expected)
From 7d8d67a07dfcdc9fa39f5fa11902efc0738bd813 Mon Sep 17 00:00:00 2001
From: GYHHAHA <1801214626@qq.com>
Date: Sat, 28 Nov 2020 09:35:44 +0800
Subject: [PATCH 22/42] Update generic.py
---
pandas/core/generic.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 4b1865b2a34d8..222cf0af5869b 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -8334,7 +8334,7 @@ def resample(
2000-10-01 23:56:00 15
2000-10-02 00:13:00 45
2000-10-02 00:30:00 45
- Freq: 17T, dtype: int32
+ Freq: 17T, dtype: int64
To replace the use of the deprecated `base` argument, you can now use `offset`,
in this example it is equivalent to have `base=2`:
From 77fc4a3fc0acaf1783f7ed87b53da6688b0f6395 Mon Sep 17 00:00:00 2001
From: GYHHAHA <1801214626@qq.com>
Date: Sat, 28 Nov 2020 15:38:36 +0800
Subject: [PATCH 23/42] doc added & tests fix
---
doc/source/user_guide/timeseries.rst | 44 ++++++++++++++++++++++
doc/source/whatsnew/v1.2.0.rst | 44 ++++++++++++++++++++++
pandas/tests/resample/test_resample_api.py | 26 ++++++++++---
3 files changed, 108 insertions(+), 6 deletions(-)
diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst
index ac8ba2fd929a6..8044172bc4c4a 100644
--- a/doc/source/user_guide/timeseries.rst
+++ b/doc/source/user_guide/timeseries.rst
@@ -1897,6 +1897,50 @@ Those two examples are equivalent for this time series:
Note the use of ``'start'`` for ``origin`` on the last example. In that case, ``origin`` will be set to the first value of the timeseries.
+Backward resample
+~~~~~~~~~~~~~~~~~
+
+.. versionadded:: 1.2.0
+
+``origin`` not only supports a forward resample, namely grouping from the starting point with the given ``freq`` , but can also perform a backward resample. This allows users to control the bins of the grouping from the given origin in the backward direction. (:issue:`37804`)
+
+.. ipython:: python
+
+ start, end = "2000-10-01 23:30:00", "2000-10-02 00:26:00"
+ rng = date_range(start, end, freq="7min")
+ ts = Series(np.arange(len(rng)) * 3, index=rng)
+
+Setting ``origin='end'`` means using the max ``Timestamp`` as the ``origin`` with ``backward=True`` .
+
+ ts.index.max()
+ ts.resample("17min", origin="end").sum()
+
+Setting ``origin='end'`` means using the ceiling midnight of the max ``Timestamp`` as the ``origin`` with ``backward=True`` .
+
+.. ipython:: python
+
+ ts.resample("17min", origin="end").sum()
+
+If you want to make the backward resample from a Timestamp-like ``origin`` , ``backward=True`` should be set.
+
+.. ipython:: python
+
+ ts.resample("17min", origin="2000-10-02 00:40:00", backward=True).sum()
+
+You can implement ``origin='end_day'`` equivalently in the following way.
+
+.. ipython:: python
+
+ end_day_origin = ts.index.max().ceil("D")
+ end_day_origin
+ ts.resample("17min", origin=end_day_origin, backward=True).sum()
+
+By default, backward resample uses ``closed=right`` while ``closed=left`` is also available.
+
+.. ipython:: python
+
+ ts.resample("17min", closed="left", origin="end").sum()
+
.. _timeseries.periods:
Time span representation
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index f751a91cecf19..d1899e1d72509 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -203,6 +203,50 @@ example where the index name is preserved:
The same is true for :class:`MultiIndex`, but the logic is applied separately on a
level-by-level basis.
+.. _whatsnew_120.backward_resample:
+
+Backward resample
+^^^^^^^^^^^^^^^^^
+
+:class:`Grouper` and :meth:`DataFrame.resample` now support the argument ``backward`` . ``'end'`` and ``'end_day'`` are available for the argument ``origin`` . Backward resample allows users to control the bins of the grouping from the given origin in the backward direction. (:issue:`37804`)
+
+.. ipython:: python
+
+ start, end = "2000-10-01 23:30:00", "2000-10-02 00:26:00"
+ rng = date_range(start, end, freq="7min")
+ ts = Series(np.arange(len(rng)) * 3, index=rng)
+
+Setting ``origin='end'`` means using the max ``Timestamp`` as the ``origin`` with ``backward=True`` .
+
+ ts.index.max()
+ ts.resample("17min", origin="end").sum()
+
+Setting ``origin='end'`` means using the ceiling midnight of the max ``Timestamp`` as the ``origin`` with ``backward=True`` .
+
+.. ipython:: python
+
+ ts.resample("17min", origin="end").sum()
+
+If you want to make the backward resample from a Timestamp-like ``origin`` , ``backward=True`` should be set.
+
+.. ipython:: python
+
+ ts.resample("17min", origin="2000-10-02 00:40:00", backward=True).sum()
+
+You can implement ``origin='end_day'`` equivalently in the following way.
+
+.. ipython:: python
+
+ end_day_origin = ts.index.max().ceil("D")
+ end_day_origin
+ ts.resample("17min", origin=end_day_origin, backward=True).sum()
+
+By default, backward resample uses ``closed=right`` while ``closed=left`` is also available.
+
+.. ipython:: python
+
+ ts.resample("17min", closed="left", origin="end").sum()
+
.. _whatsnew_120.enhancements.other:
Other enhancements
diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py
index 54f2beab25d8b..3b13e16b3df82 100644
--- a/pandas/tests/resample/test_resample_api.py
+++ b/pandas/tests/resample/test_resample_api.py
@@ -613,14 +613,12 @@ def test_resample_agg_readonly():
tm.assert_series_equal(result, expected)
-# test data for backward resample GH#37804
-start, end = "2000-10-01 23:30:00", "2000-10-02 00:26:00"
-rng = date_range(start, end, freq="7min")
-ts = Series(np.arange(len(rng)) * 3, index=rng)
-
-
def test_backward_origin_consistency():
+ start, end = "2000-10-01 23:30:00", "2000-10-02 00:26:00"
+ rng = date_range(start, end, freq="7min")
+ ts = Series(np.arange(len(rng)) * 3, index=rng)
+
msg = "`start` or `start_day` origin isn't allowed when `backward` is True"
with pytest.raises(ValueError, match=msg):
ts.resample("1min", origin="start", backward=True)
@@ -631,6 +629,10 @@ def test_backward_origin_consistency():
def test_end_origin():
+ start, end = "2000-10-01 23:30:00", "2000-10-02 00:26:00"
+ rng = date_range(start, end, freq="7min")
+ ts = Series(np.arange(len(rng)) * 3, index=rng)
+
res = ts.resample("17min", origin="end").sum().astype("int64")
data = [0, 18, 27, 63]
expected = Series(
@@ -659,6 +661,10 @@ def test_end_origin():
def test_end_day_origin():
+ start, end = "2000-10-01 23:30:00", "2000-10-02 00:26:00"
+ rng = date_range(start, end, freq="7min")
+ ts = Series(np.arange(len(rng)) * 3, index=rng)
+
# 12 == 24 * 60 - 84 * 17 <= 26 (last value) <= 24 * 60 - 83 * 17 == 29
res = ts.resample("17min", origin="end_day").sum().astype("int64")
data = [3, 15, 45, 45]
@@ -676,6 +682,10 @@ def test_end_day_origin():
def test_backward_resample_with_datetime_origin():
+ start, end = "2000-10-01 23:30:00", "2000-10-02 00:26:00"
+ rng = date_range(start, end, freq="7min")
+ ts = Series(np.arange(len(rng)) * 3, index=rng)
+
res = (
ts.resample(
"17min",
@@ -721,6 +731,10 @@ def test_backward_resample_with_datetime_origin():
def test_left_and_right_close_in_backward_resample():
+ start, end = "2000-10-01 23:30:00", "2000-10-02 00:26:00"
+ rng = date_range(start, end, freq="7min")
+ ts = Series(np.arange(len(rng)) * 3, index=rng)
+
res = (
ts.resample(
"17min",
From b49229367fc3ab02e81c8c373d05c021560054f2 Mon Sep 17 00:00:00 2001
From: GYHHAHA <1801214626@qq.com>
Date: Sat, 28 Nov 2020 18:31:07 +0800
Subject: [PATCH 24/42] fix doc
---
doc/source/user_guide/timeseries.rst | 28 ++++++++++++++------------
doc/source/whatsnew/v1.2.0.rst | 30 +++++++++-------------------
2 files changed, 24 insertions(+), 34 deletions(-)
diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst
index bee72ec70d95e..843da644848b1 100644
--- a/doc/source/user_guide/timeseries.rst
+++ b/doc/source/user_guide/timeseries.rst
@@ -1888,31 +1888,39 @@ Those two examples are equivalent for this time series:
Note the use of ``'start'`` for ``origin`` on the last example. In that case, ``origin`` will be set to the first value of the timeseries.
+.. _timeseries.backward-resample:
+
Backward resample
~~~~~~~~~~~~~~~~~
.. versionadded:: 1.2.0
-``origin`` not only supports a forward resample, namely grouping from the starting point with the given ``freq`` , but can also perform a backward resample. This allows users to control the bins of the grouping from the given origin in the backward direction. (:issue:`37804`)
+``origin`` not only supports a forward resample, namely grouping from the starting point with the given ``freq``, but can also perform a backward resample. This allows users to control the bins of the grouping from the given origin in the backward direction. (:issue:`37804`)
.. ipython:: python
start, end = "2000-10-01 23:30:00", "2000-10-02 00:26:00"
- rng = date_range(start, end, freq="7min")
- ts = Series(np.arange(len(rng)) * 3, index=rng)
+ rng = pd.date_range(start, end, freq="7min")
+ ts = pd.Series(np.arange(len(rng)) * 3, index=rng)
-Setting ``origin='end'`` means using the max ``Timestamp`` as the ``origin`` with ``backward=True`` .
+Setting ``origin='end'`` means using the max ``Timestamp`` as the ``origin`` with ``backward=True``.
ts.index.max()
ts.resample("17min", origin="end").sum()
-Setting ``origin='end'`` means using the ceiling midnight of the max ``Timestamp`` as the ``origin`` with ``backward=True`` .
+With a forward resample, each output label stands for the group from that label to the next one, using ``closed=left`` by default. With a backward resample, each output label stands for the group from the previous label up to that label, using ``closed=right`` by default. If you want to change this, ``closed=left`` is also available.
.. ipython:: python
- ts.resample("17min", origin="end").sum()
+ ts.resample("17min", closed="left", origin="end").sum()
+
+Setting ``origin='end_day'`` means using the ceiling midnight of the max ``Timestamp`` as the ``origin`` with ``backward=True``.
+
+.. ipython:: python
-If you want to make the backward resample from a Timestamp-like ``origin`` , ``backward=True`` should be set.
+ ts.resample("17min", origin="end_day").sum()
+
+If you want to make the backward resample from a Timestamp-like ``origin``, ``backward=True`` should be set.
.. ipython:: python
@@ -1926,12 +1934,6 @@ You can implement ``origin='end_day'`` equivalently in the following way.
end_day_origin
ts.resample("17min", origin=end_day_origin, backward=True).sum()
-By default, backward resample uses ``closed=right`` while ``closed=left`` is also available.
-
-.. ipython:: python
-
- ts.resample("17min", closed="left", origin="end").sum()
-
.. _timeseries.periods:
Time span representation
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index ac8132339d38c..d45813960d5c2 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -206,47 +206,35 @@ level-by-level basis.
.. _whatsnew_120.backward_resample:
-Backward resample
+Backward resample
^^^^^^^^^^^^^^^^^
-:class:`Grouper` and :meth:`DataFrame.resample` now support the argument ``backward`` . ``'end'`` and ``'end_day'`` are available for the argument ``origin`` . Backward resample allows users to control the bins of the grouping from the given origin in the backward direction. (:issue:`37804`)
+:class:`Grouper` and :meth:`DataFrame.resample` now support the argument ``backward``. ``'end'`` and ``'end_day'`` are available for the argument ``origin``. Backward resample allows users to control the bins of the grouping from the given origin in the backward direction. (:issue:`37804`)
.. ipython:: python
start, end = "2000-10-01 23:30:00", "2000-10-02 00:26:00"
- rng = date_range(start, end, freq="7min")
- ts = Series(np.arange(len(rng)) * 3, index=rng)
+ rng = pd.date_range(start, end, freq="7min")
+ ts = pd.Series(np.arange(len(rng)) * 3, index=rng)
-Setting ``offset='end'`` means using the max ``Timestamp`` as the ``origin`` with ``backward=True`` .
+Setting ``origin='end'`` means using the max ``Timestamp`` as the ``origin`` with ``backward=True``.
ts.index.max()
ts.resample("17min", origin="end").sum()
-Setting ``offset='end'`` means using the ceiling midnight of the max ``Timestamp`` as the ``origin`` with ``backward=True`` .
+Setting ``origin='end_day'`` means using the ceiling midnight of the max ``Timestamp`` as the ``origin`` with ``backward=True``.
.. ipython:: python
- ts.resample("17min", origin="end").sum()
+ ts.resample("17min", origin="end_day").sum()
-If you want to make the backward resample from a Timestamp-like ``origin`` , ``backward=True`` should be set.
+If you want to resample backward from a Timestamp-like ``origin``, ``backward=True`` should be set.
.. ipython:: python
ts.resample("17min", origin="2000-10-02 00:40:00", backward=True).sum()
-You can implement ``offset='end_day'`` in the following method equivalently.
-
-.. ipython:: python
-
- end_day_origin = ts.index.max().ceil("D")
- end_day_origin
- ts.resample("17min", origin=end_day_origin, backward=True).sum()
-
-By defualt, backward resample uses ``closed=right`` while ``closed=left`` is also available.
-
-.. ipython:: python
-
- ts.resample("17min", closed="left", origin="end").sum()
+For more details, see :ref:`timeseries.backward-resample`.
.. _whatsnew_120.groupby_ewm:
From 76a015acf5e00542f48202481898f31ada5789e1 Mon Sep 17 00:00:00 2001
From: GYHHAHA <1801214626@qq.com>
Date: Fri, 11 Dec 2020 12:52:55 +0800
Subject: [PATCH 25/42] Revert "Merge remote-tracking branch 'upstream/master'"
This reverts commit 561096c991eff2c3cca2f2be7ecc5dd3b2f6a97d, reversing
changes made to b49229367fc3ab02e81c8c373d05c021560054f2.
---
Dockerfile | 15 +-
README.md | 38 +-
asv_bench/benchmarks/indexing.py | 8 -
asv_bench/benchmarks/rolling.py | 14 -
ci/deps/azure-38-numpydev.yaml | 2 +-
doc/source/development/contributing.rst | 5 +-
doc/source/reference/series.rst | 1 +
doc/source/user_guide/io.rst | 40 +-
doc/source/whatsnew/v1.1.5.rst | 5 -
doc/source/whatsnew/v1.2.0.rst | 100 +----
pandas/_libs/groupby.pyx | 12 +-
pandas/_libs/reduction.pyx | 4 +-
pandas/_libs/tslibs/period.pyx | 11 +-
pandas/_testing.py | 43 +-
pandas/_typing.py | 4 +-
pandas/compat/numpy/function.py | 16 +-
pandas/conftest.py | 39 +-
pandas/core/algorithms.py | 75 ++--
pandas/core/apply.py | 27 +-
pandas/core/arrays/_mixins.py | 2 +-
pandas/core/arrays/base.py | 22 +-
pandas/core/arrays/boolean.py | 7 +-
pandas/core/arrays/categorical.py | 12 +-
pandas/core/arrays/datetimelike.py | 21 +-
pandas/core/arrays/floating.py | 76 +++-
pandas/core/arrays/integer.py | 81 +++-
pandas/core/arrays/interval.py | 92 ++--
pandas/core/arrays/numeric.py | 92 ----
pandas/core/arrays/numpy_.py | 18 +-
pandas/core/arrays/sparse/array.py | 2 +-
pandas/core/arrays/string_.py | 16 +-
pandas/core/arrays/timedeltas.py | 4 +-
pandas/core/base.py | 14 +-
pandas/core/computation/pytables.py | 2 +-
pandas/core/construction.py | 18 -
pandas/core/dtypes/cast.py | 208 +++++----
pandas/core/dtypes/concat.py | 28 +-
pandas/core/dtypes/dtypes.py | 8 +-
pandas/core/dtypes/generic.py | 75 +---
pandas/core/frame.py | 39 +-
pandas/core/generic.py | 118 +++--
pandas/core/groupby/base.py | 4 -
pandas/core/groupby/generic.py | 188 ++++----
pandas/core/groupby/groupby.py | 216 ++++-----
pandas/core/groupby/grouper.py | 12 +-
pandas/core/groupby/ops.py | 162 +++----
pandas/core/indexers.py | 3 -
pandas/core/indexes/base.py | 170 ++-----
pandas/core/indexes/category.py | 25 +-
pandas/core/indexes/datetimelike.py | 20 +-
pandas/core/indexes/datetimes.py | 3 +-
pandas/core/indexes/extension.py | 2 +-
pandas/core/indexes/interval.py | 73 +--
pandas/core/indexes/multi.py | 67 ++-
pandas/core/indexes/numeric.py | 33 +-
pandas/core/indexes/period.py | 33 +-
pandas/core/indexes/range.py | 33 +-
pandas/core/indexing.py | 28 +-
pandas/core/internals/blocks.py | 33 +-
pandas/core/internals/managers.py | 21 +-
pandas/core/missing.py | 140 ++----
pandas/core/nanops.py | 19 +-
pandas/core/ops/__init__.py | 5 +-
pandas/core/ops/array_ops.py | 34 +-
pandas/core/ops/methods.py | 58 +--
pandas/core/resample.py | 2 +-
pandas/core/reshape/melt.py | 4 +-
pandas/core/reshape/merge.py | 28 +-
pandas/core/reshape/pivot.py | 123 ++----
pandas/core/series.py | 56 +--
pandas/core/shared_docs.py | 2 +-
pandas/core/sorting.py | 21 +-
pandas/core/strings/accessor.py | 112 +++--
pandas/core/util/numba_.py | 2 +-
pandas/core/window/rolling.py | 38 +-
pandas/io/common.py | 16 +-
pandas/io/excel/_base.py | 98 +----
pandas/io/excel/_xlwt.py | 4 +-
pandas/io/formats/csvs.py | 14 +-
pandas/io/formats/latex.py | 4 +-
pandas/io/formats/printing.py | 2 +-
pandas/io/formats/style.py | 20 +-
pandas/io/html.py | 5 +-
pandas/io/json/_json.py | 15 +-
pandas/io/parquet.py | 61 +--
pandas/io/parsers.py | 28 +-
pandas/io/pytables.py | 16 +-
pandas/io/sas/sasreader.py | 17 +-
pandas/io/stata.py | 2 +-
pandas/plotting/_matplotlib/timeseries.py | 5 +-
pandas/tests/arithmetic/conftest.py | 12 +
pandas/tests/arithmetic/test_timedelta64.py | 2 +-
.../tests/arrays/boolean/test_arithmetic.py | 13 +-
pandas/tests/arrays/boolean/test_function.py | 7 -
.../tests/arrays/categorical/test_missing.py | 8 +-
pandas/tests/arrays/floating/test_function.py | 7 -
.../tests/arrays/integer/test_arithmetic.py | 52 +--
pandas/tests/arrays/integer/test_function.py | 8 -
pandas/tests/arrays/masked/test_arithmetic.py | 6 +-
.../tests/arrays/sparse/test_arithmetics.py | 45 +-
pandas/tests/arrays/sparse/test_libsparse.py | 77 ++--
pandas/tests/arrays/string_/test_string.py | 21 -
pandas/tests/arrays/test_datetimelike.py | 25 +-
pandas/tests/base/test_conversion.py | 21 +-
.../tests/dtypes/cast/test_convert_objects.py | 12 +
pandas/tests/dtypes/test_common.py | 20 +-
pandas/tests/dtypes/test_dtypes.py | 8 -
pandas/tests/dtypes/test_inference.py | 4 +-
pandas/tests/extension/arrow/test_bool.py | 4 -
pandas/tests/extension/base/interface.py | 23 -
pandas/tests/extension/base/methods.py | 8 +-
pandas/tests/extension/decimal/array.py | 8 -
pandas/tests/extension/json/test_json.py | 7 -
pandas/tests/extension/test_boolean.py | 6 +-
pandas/tests/extension/test_categorical.py | 22 -
pandas/tests/extension/test_floating.py | 4 -
pandas/tests/extension/test_integer.py | 9 +-
pandas/tests/extension/test_string.py | 4 -
pandas/tests/frame/apply/test_frame_apply.py | 6 -
pandas/tests/frame/common.py | 4 +-
pandas/tests/frame/indexing/test_indexing.py | 22 +-
pandas/tests/frame/indexing/test_setitem.py | 18 -
pandas/tests/frame/indexing/test_where.py | 12 +-
pandas/tests/frame/methods/test_astype.py | 2 +-
.../tests/frame/methods/test_combine_first.py | 103 ++---
pandas/tests/frame/methods/test_convert.py | 4 +-
pandas/tests/frame/methods/test_diff.py | 8 +-
.../frame/methods/test_drop_duplicates.py | 9 -
pandas/tests/frame/methods/test_fillna.py | 6 +-
pandas/tests/frame/methods/test_rename.py | 4 +-
pandas/tests/frame/methods/test_replace.py | 2 +-
.../tests/frame/methods/test_reset_index.py | 2 +-
.../tests/frame/methods/test_sort_values.py | 8 +-
pandas/tests/frame/methods/test_to_csv.py | 12 +-
pandas/tests/frame/methods/test_to_records.py | 57 ++-
pandas/tests/frame/test_arithmetic.py | 24 +-
pandas/tests/frame/test_constructors.py | 14 +-
pandas/tests/frame/test_nonunique_indexes.py | 2 +-
pandas/tests/frame/test_query_eval.py | 2 +-
pandas/tests/frame/test_reductions.py | 32 +-
pandas/tests/frame/test_stack_unstack.py | 55 +--
pandas/tests/frame/test_ufunc.py | 2 +-
pandas/tests/frame/test_validate.py | 2 +-
pandas/tests/generic/test_duplicate_labels.py | 6 +-
pandas/tests/groupby/test_apply.py | 26 +-
pandas/tests/groupby/test_function.py | 411 ++++++++---------
pandas/tests/groupby/test_groupby.py | 12 +-
pandas/tests/groupby/test_grouping.py | 5 +-
pandas/tests/groupby/test_nth.py | 20 -
pandas/tests/groupby/test_quantile.py | 27 +-
pandas/tests/groupby/test_rank.py | 77 +---
pandas/tests/groupby/test_timegrouper.py | 2 +-
pandas/tests/groupby/test_value_counts.py | 10 +-
.../tests/groupby/transform/test_transform.py | 54 +--
.../tests/indexes/base_class/test_setops.py | 2 +-
.../indexes/categorical/test_category.py | 8 +-
pandas/tests/indexes/common.py | 5 +-
.../tests/indexes/datetimes/test_datetime.py | 51 +--
pandas/tests/indexes/datetimes/test_setops.py | 17 +-
pandas/tests/indexes/interval/test_formats.py | 29 +-
pandas/tests/indexes/interval/test_setops.py | 6 +-
pandas/tests/indexes/multi/test_analytics.py | 3 +-
pandas/tests/indexes/multi/test_drop.py | 12 -
pandas/tests/indexes/multi/test_setops.py | 43 +-
pandas/tests/indexes/period/test_indexing.py | 53 ---
.../indexes/period/test_partial_slicing.py | 2 +-
pandas/tests/indexes/ranges/test_setops.py | 56 +--
pandas/tests/indexes/test_base.py | 21 +-
pandas/tests/indexes/test_numpy_compat.py | 6 +-
pandas/tests/indexes/test_setops.py | 58 +--
.../indexes/timedeltas/test_timedelta.py | 11 +-
.../indexing/multiindex/test_multiindex.py | 10 -
.../tests/indexing/multiindex/test_slice.py | 10 -
pandas/tests/indexing/test_categorical.py | 9 +-
pandas/tests/indexing/test_indexers.py | 6 -
pandas/tests/indexing/test_loc.py | 36 +-
pandas/tests/indexing/test_scalar.py | 70 ++-
pandas/tests/io/conftest.py | 3 -
pandas/tests/io/excel/test_openpyxl.py | 13 +-
pandas/tests/io/excel/test_readers.py | 6 +-
pandas/tests/io/excel/test_style.py | 17 +-
pandas/tests/io/excel/test_writers.py | 59 ++-
pandas/tests/io/excel/test_xlrd.py | 46 +-
pandas/tests/io/excel/test_xlsxwriter.py | 19 +-
pandas/tests/io/formats/test_to_excel.py | 4 +-
pandas/tests/io/formats/test_to_html.py | 4 +-
pandas/tests/io/formats/test_to_latex.py | 2 +-
pandas/tests/io/json/test_compression.py | 6 +-
pandas/tests/io/json/test_normalize.py | 10 +-
pandas/tests/io/json/test_pandas.py | 22 +-
pandas/tests/io/json/test_readlines.py | 37 +-
pandas/tests/io/json/test_ujson.py | 28 +-
pandas/tests/io/parser/conftest.py | 2 +-
pandas/tests/io/parser/test_c_parser_only.py | 12 +-
pandas/tests/io/parser/test_common.py | 118 ++---
pandas/tests/io/parser/test_compression.py | 4 +-
pandas/tests/io/parser/test_converters.py | 2 +-
pandas/tests/io/parser/test_dtypes.py | 20 +-
pandas/tests/io/parser/test_mangle_dupes.py | 2 +-
pandas/tests/io/parser/test_multi_thread.py | 19 +-
pandas/tests/io/parser/test_network.py | 46 +-
pandas/tests/io/parser/test_parse_dates.py | 6 +-
pandas/tests/io/parser/test_textreader.py | 6 +-
pandas/tests/io/pytables/test_store.py | 120 ++---
pandas/tests/io/sas/test_sas7bdat.py | 39 +-
pandas/tests/io/sas/test_xport.py | 41 +-
pandas/tests/io/test_common.py | 12 +-
pandas/tests/io/test_html.py | 12 +-
pandas/tests/io/test_parquet.py | 55 +--
pandas/tests/io/test_stata.py | 10 +-
pandas/tests/plotting/common.py | 7 -
pandas/tests/plotting/frame/test_frame.py | 85 +++-
.../tests/plotting/frame/test_frame_color.py | 29 +-
.../plotting/frame/test_frame_groupby.py | 2 -
.../plotting/frame/test_frame_subplots.py | 77 ++--
pandas/tests/plotting/test_backend.py | 3 -
pandas/tests/plotting/test_boxplot_method.py | 30 +-
pandas/tests/plotting/test_common.py | 2 -
pandas/tests/plotting/test_converter.py | 3 -
pandas/tests/plotting/test_datetimelike.py | 57 ++-
pandas/tests/plotting/test_groupby.py | 2 -
pandas/tests/plotting/test_hist_method.py | 20 +-
pandas/tests/plotting/test_misc.py | 14 +-
pandas/tests/plotting/test_series.py | 32 +-
pandas/tests/plotting/test_style.py | 2 -
pandas/tests/resample/test_datetime_index.py | 8 +-
pandas/tests/resample/test_time_grouper.py | 28 +-
.../tests/reshape/concat/test_categorical.py | 14 +-
pandas/tests/reshape/concat/test_datetimes.py | 20 +-
pandas/tests/reshape/concat/test_empty.py | 2 +-
pandas/tests/reshape/concat/test_invalid.py | 6 +-
pandas/tests/reshape/merge/test_merge.py | 52 +--
.../merge/test_merge_index_as_string.py | 20 +-
.../tests/reshape/merge/test_merge_ordered.py | 81 ----
pandas/tests/reshape/merge/test_multi.py | 142 +++---
pandas/tests/reshape/test_crosstab.py | 33 +-
pandas/tests/reshape/test_cut.py | 8 +-
pandas/tests/reshape/test_qcut.py | 8 +-
pandas/tests/scalar/period/test_period.py | 16 -
pandas/tests/series/indexing/test_datetime.py | 2 +-
pandas/tests/series/indexing/test_getitem.py | 20 +-
pandas/tests/series/indexing/test_indexing.py | 11 +-
pandas/tests/series/indexing/test_xs.py | 4 +-
pandas/tests/series/methods/test_convert.py | 76 +++-
.../series/methods/test_convert_dtypes.py | 416 +++++++++++-------
.../tests/series/methods/test_interpolate.py | 80 +---
pandas/tests/series/test_arithmetic.py | 19 +-
pandas/tests/test_algos.py | 3 +
pandas/tests/tools/test_to_numeric.py | 42 +-
pandas/tests/tseries/holiday/test_holiday.py | 16 +-
.../tests/tseries/holiday/test_observance.py | 22 +-
pandas/tests/tseries/offsets/test_offsets.py | 4 +-
pandas/tests/tslibs/test_to_offset.py | 18 +-
.../util/test_assert_categorical_equal.py | 2 +-
.../util/test_assert_extension_array_equal.py | 8 +-
pandas/tests/util/test_assert_frame_equal.py | 18 +-
pandas/tests/util/test_assert_index_equal.py | 6 +-
.../util/test_assert_interval_array_equal.py | 14 +-
pandas/tests/util/test_assert_series_equal.py | 8 +-
pandas/tests/util/test_show_versions.py | 2 +-
pandas/tests/util/test_validate_args.py | 4 +-
pandas/tests/util/test_validate_kwargs.py | 2 +-
.../moments/test_moments_consistency_ewm.py | 1 +
.../test_moments_consistency_rolling.py | 1 +
pandas/tests/window/test_groupby.py | 79 +---
pandas/tests/window/test_rolling.py | 11 +-
pandas/util/_print_versions.py | 2 +-
267 files changed, 3078 insertions(+), 4938 deletions(-)
delete mode 100644 pandas/core/arrays/numeric.py
create mode 100644 pandas/tests/dtypes/cast/test_convert_objects.py
diff --git a/Dockerfile b/Dockerfile
index de1c564921de9..5d7a2b9e6b743 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM quay.io/condaforge/miniforge3
+FROM continuumio/miniconda3
# if you forked pandas, you can pass in your own GitHub username to use your fork
# i.e. gh_username=myname
@@ -15,6 +15,10 @@ RUN apt-get update \
# Verify git, process tools, lsb-release (common in install instructions for CLIs) installed
&& apt-get -y install git iproute2 procps iproute2 lsb-release \
#
+ # Install C compilers (gcc not enough, so just went with build-essential which admittedly might be overkill),
+ # needed to build pandas C extensions
+ && apt-get -y install build-essential \
+ #
# cleanup
&& apt-get autoremove -y \
&& apt-get clean -y \
@@ -35,14 +39,9 @@ RUN mkdir "$pandas_home" \
# we just update the base/root one from the 'environment.yml' file instead of creating a new one.
#
# Set up environment
-RUN conda install -y mamba
-RUN mamba env update -n base -f "$pandas_home/environment.yml"
+RUN conda env update -n base -f "$pandas_home/environment.yml"
# Build C extensions and pandas
-SHELL ["/bin/bash", "-c"]
-RUN . /opt/conda/etc/profile.d/conda.sh \
- && conda activate base \
- && cd "$pandas_home" \
- && export \
+RUN cd "$pandas_home" \
&& python setup.py build_ext -j 4 \
&& python -m pip install -e .
diff --git a/README.md b/README.md
index 6d1d890c54093..4072faffe3b3a 100644
--- a/README.md
+++ b/README.md
@@ -63,24 +63,24 @@ Here are just a few of the things that pandas does well:
date shifting and lagging
- [missing-data]: https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html
- [insertion-deletion]: https://pandas.pydata.org/pandas-docs/stable/user_guide/dsintro.html#column-selection-addition-deletion
- [alignment]: https://pandas.pydata.org/pandas-docs/stable/user_guide/dsintro.html?highlight=alignment#intro-to-data-structures
- [groupby]: https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#group-by-split-apply-combine
- [conversion]: https://pandas.pydata.org/pandas-docs/stable/user_guide/dsintro.html#dataframe
- [slicing]: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#slicing-ranges
- [fancy-indexing]: https://pandas.pydata.org/pandas-docs/stable/user_guide/advanced.html#advanced
- [subsetting]: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#boolean-indexing
- [merging]: https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html#database-style-dataframe-or-named-series-joining-merging
- [joining]: https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html#joining-on-index
- [reshape]: https://pandas.pydata.org/pandas-docs/stable/user_guide/reshaping.html
- [pivot-table]: https://pandas.pydata.org/pandas-docs/stable/user_guide/reshaping.html
- [mi]: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#hierarchical-indexing-multiindex
- [flat-files]: https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#csv-text-files
- [excel]: https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#excel-files
- [db]: https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#sql-queries
- [hdfstore]: https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#hdf5-pytables
- [timeseries]: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#time-series-date-functionality
+ [missing-data]: https://pandas.pydata.org/pandas-docs/stable/missing_data.html#working-with-missing-data
+ [insertion-deletion]: https://pandas.pydata.org/pandas-docs/stable/dsintro.html#column-selection-addition-deletion
+ [alignment]: https://pandas.pydata.org/pandas-docs/stable/dsintro.html?highlight=alignment#intro-to-data-structures
+ [groupby]: https://pandas.pydata.org/pandas-docs/stable/groupby.html#group-by-split-apply-combine
+ [conversion]: https://pandas.pydata.org/pandas-docs/stable/dsintro.html#dataframe
+ [slicing]: https://pandas.pydata.org/pandas-docs/stable/indexing.html#slicing-ranges
+ [fancy-indexing]: https://pandas.pydata.org/pandas-docs/stable/indexing.html#advanced-indexing-with-ix
+ [subsetting]: https://pandas.pydata.org/pandas-docs/stable/indexing.html#boolean-indexing
+ [merging]: https://pandas.pydata.org/pandas-docs/stable/merging.html#database-style-dataframe-joining-merging
+ [joining]: https://pandas.pydata.org/pandas-docs/stable/merging.html#joining-on-index
+ [reshape]: https://pandas.pydata.org/pandas-docs/stable/reshaping.html#reshaping-and-pivot-tables
+ [pivot-table]: https://pandas.pydata.org/pandas-docs/stable/reshaping.html#pivot-tables-and-cross-tabulations
+ [mi]: https://pandas.pydata.org/pandas-docs/stable/indexing.html#hierarchical-indexing-multiindex
+ [flat-files]: https://pandas.pydata.org/pandas-docs/stable/io.html#csv-text-files
+ [excel]: https://pandas.pydata.org/pandas-docs/stable/io.html#excel-files
+ [db]: https://pandas.pydata.org/pandas-docs/stable/io.html#sql-queries
+ [hdfstore]: https://pandas.pydata.org/pandas-docs/stable/io.html#hdf5-pytables
+ [timeseries]: https://pandas.pydata.org/pandas-docs/stable/timeseries.html#time-series-date-functionality
## Where to get it
The source code is currently hosted on GitHub at:
@@ -154,7 +154,7 @@ For usage questions, the best place to go to is [StackOverflow](https://stackove
Further, general questions and discussions can also take place on the [pydata mailing list](https://groups.google.com/forum/?fromgroups#!forum/pydata).
## Discussion and Development
-Most development discussions take place on GitHub in this repo. Further, the [pandas-dev mailing list](https://mail.python.org/mailman/listinfo/pandas-dev) can also be used for specialized discussions or design issues, and a [Gitter channel](https://gitter.im/pydata/pandas) is available for quick development related questions.
+Most development discussions take place on github in this repo. Further, the [pandas-dev mailing list](https://mail.python.org/mailman/listinfo/pandas-dev) can also be used for specialized discussions or design issues, and a [Gitter channel](https://gitter.im/pydata/pandas) is available for quick development related questions.
## Contributing to pandas [](https://www.codetriage.com/pandas-dev/pandas)
diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py
index 4fd91c8aafe4b..74e0a3a434cde 100644
--- a/asv_bench/benchmarks/indexing.py
+++ b/asv_bench/benchmarks/indexing.py
@@ -358,14 +358,6 @@ def time_assign_with_setitem(self):
for i in range(100):
self.df[i] = np.random.randn(self.N)
- def time_assign_list_like_with_setitem(self):
- np.random.seed(1234)
- self.df[list(range(100))] = np.random.randn(self.N, 100)
-
- def time_assign_list_of_columns_concat(self):
- df = DataFrame(np.random.randn(self.N, 100))
- concat([self.df, df], axis=1)
-
class ChainIndexing:
diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py
index 5a36cff7908f0..79a33c437ea5c 100644
--- a/asv_bench/benchmarks/rolling.py
+++ b/asv_bench/benchmarks/rolling.py
@@ -225,20 +225,6 @@ def time_rolling_offset(self, method):
getattr(self.groupby_roll_offset, method)()
-class GroupbyLargeGroups:
- # https://github.com/pandas-dev/pandas/issues/38038
- # specific example where the rolling operation on a larger dataframe
- # is relatively cheap (few but large groups), but creation of
- # MultiIndex of result can be expensive
-
- def setup(self):
- N = 100000
- self.df = pd.DataFrame({"A": [1, 2] * int(N / 2), "B": np.random.randn(N)})
-
- def time_rolling_multiindex_creation(self):
- self.df.groupby("A").rolling(3).mean()
-
-
class GroupbyEWM:
params = ["cython", "numba"]
diff --git a/ci/deps/azure-38-numpydev.yaml b/ci/deps/azure-38-numpydev.yaml
index f11a3bcb28ab2..274be0361c2e5 100644
--- a/ci/deps/azure-38-numpydev.yaml
+++ b/ci/deps/azure-38-numpydev.yaml
@@ -12,7 +12,7 @@ dependencies:
# pandas dependencies
- pytz
- - pip=20.2
+ - pip
- pip:
- cython==0.29.21 # GH#34014
- "git+git://github.com/dateutil/dateutil.git"
diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst
index 86d495ef2b097..3c5a88333be56 100644
--- a/doc/source/development/contributing.rst
+++ b/doc/source/development/contributing.rst
@@ -147,9 +147,8 @@ Creating a development environment
To test out code changes, you'll need to build pandas from source, which
requires a C/C++ compiler and Python environment. If you're making documentation
-changes, you can skip to :ref:`contributing.documentation` but if you skip
-creating the development environment you won't be able to build the documentation
-locally before pushing your changes.
+changes, you can skip to :ref:`contributing.documentation` but you won't be able
+to build the documentation locally before pushing your changes.
Using a Docker container
~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst
index cc2937695e80f..8d74c288bf801 100644
--- a/doc/source/reference/series.rst
+++ b/doc/source/reference/series.rst
@@ -252,6 +252,7 @@ Combining / comparing / joining / merging
Series.append
Series.compare
+ Series.replace
Series.update
Time Series-related
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 2b324a74fffaf..1bd35131622ab 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -1577,21 +1577,19 @@ value will be an iterable object of type ``TextFileReader``:
.. ipython:: python
- with pd.read_csv("tmp.sv", sep="|", chunksize=4) as reader:
- reader
- for chunk in reader:
- print(chunk)
+ reader = pd.read_csv("tmp.sv", sep="|", chunksize=4)
+ reader
-.. versionchanged:: 1.2
+ for chunk in reader:
+ print(chunk)
- ``read_csv/json/sas`` return a context-manager when iterating through a file.
Specifying ``iterator=True`` will also return the ``TextFileReader`` object:
.. ipython:: python
- with pd.read_csv("tmp.sv", sep="|", iterator=True) as reader:
- reader.get_chunk(5)
+ reader = pd.read_csv("tmp.sv", sep="|", iterator=True)
+ reader.get_chunk(5)
.. ipython:: python
:suppress:
@@ -2240,10 +2238,10 @@ For line-delimited json files, pandas can also return an iterator which reads in
df.to_json(orient="records", lines=True)
# reader is an iterator that returns ``chunksize`` lines each iteration
- with pd.read_json(StringIO(jsonl), lines=True, chunksize=1) as reader:
- reader
- for chunk in reader:
- print(chunk)
+ reader = pd.read_json(StringIO(jsonl), lines=True, chunksize=1)
+ reader
+ for chunk in reader:
+ print(chunk)
.. _io.table_schema:
@@ -5473,9 +5471,9 @@ object can be used as an iterator.
.. ipython:: python
- with pd.read_stata("stata.dta", chunksize=3) as reader:
- for df in reader:
- print(df.shape)
+ reader = pd.read_stata("stata.dta", chunksize=3)
+ for df in reader:
+ print(df.shape)
For more fine-grained control, use ``iterator=True`` and specify
``chunksize`` with each call to
@@ -5483,9 +5481,9 @@ For more fine-grained control, use ``iterator=True`` and specify
.. ipython:: python
- with pd.read_stata("stata.dta", iterator=True) as reader:
- chunk1 = reader.read(5)
- chunk2 = reader.read(5)
+ reader = pd.read_stata("stata.dta", iterator=True)
+ chunk1 = reader.read(5)
+ chunk2 = reader.read(5)
Currently the ``index`` is retrieved as a column.
@@ -5597,9 +5595,9 @@ Obtain an iterator and read an XPORT file 100,000 lines at a time:
pass
- with pd.read_sas("sas_xport.xpt", chunk=100000) as rdr:
- for chunk in rdr:
- do_something(chunk)
+ rdr = pd.read_sas("sas_xport.xpt", chunk=100000)
+ for chunk in rdr:
+ do_something(chunk)
The specification_ for the xport file format is available from the SAS
web site.
diff --git a/doc/source/whatsnew/v1.1.5.rst b/doc/source/whatsnew/v1.1.5.rst
index 7164830392f35..46c4ad4f35fe4 100644
--- a/doc/source/whatsnew/v1.1.5.rst
+++ b/doc/source/whatsnew/v1.1.5.rst
@@ -19,15 +19,10 @@ Fixed regressions
- Fixed regression in :meth:`DataFrame.loc` and :meth:`Series.loc` for ``__setitem__`` when one-dimensional tuple was given to select from :class:`MultiIndex` (:issue:`37711`)
- Fixed regression in inplace operations on :class:`Series` with ``ExtensionDtype`` with NumPy dtyped operand (:issue:`37910`)
- Fixed regression in metadata propagation for ``groupby`` iterator (:issue:`37343`)
-- Fixed regression in :class:`MultiIndex` constructed from a :class:`DatetimeIndex` not retaining frequency (:issue:`35563`)
-- Fixed regression in :meth:`DataFrame.unstack` with columns with integer dtype (:issue:`37115`)
- Fixed regression in indexing on a :class:`Series` with ``CategoricalDtype`` after unpickling (:issue:`37631`)
- Fixed regression in :meth:`DataFrame.groupby` aggregation with out-of-bounds datetime objects in an object-dtype column (:issue:`36003`)
- Fixed regression in ``df.groupby(..).rolling(..)`` with the resulting :class:`MultiIndex` when grouping by a label that is in the index (:issue:`37641`)
- Fixed regression in :meth:`DataFrame.fillna` not filling ``NaN`` after other operations such as :meth:`DataFrame.pivot` (:issue:`36495`).
-- Fixed performance regression in ``df.groupby(..).rolling(..)`` (:issue:`38038`)
-- Fixed regression in :meth:`MultiIndex.intersection` returning duplicates when at least one of the indexes had duplicates (:issue:`36915`)
-- Fixed regression in :meth:`.GroupBy.first` and :meth:`.GroupBy.last` where ``None`` was considered a non-NA value (:issue:`38286`)
.. ---------------------------------------------------------------------------
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index 3bd3f1821f525..d45813960d5c2 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -8,15 +8,6 @@ including other versions of pandas.
{{ header }}
-.. warning::
-
- Previously, the default argument ``engine=None`` to ``pd.read_excel``
- would result in using the `xlrd `_ engine in
- many cases. The engine ``xlrd`` is no longer maintained, and is not supported with
- python >= 3.9. If `openpyxl `_ is installed,
- many of these cases will now default to using the ``openpyxl`` engine. See the
- :func:`read_excel` documentation for more details.
-
.. ---------------------------------------------------------------------------
Enhancements
@@ -33,45 +24,27 @@ prevent accidental introduction of duplicate labels, which can affect downstream
By default, duplicates continue to be allowed.
-.. code-block:: ipython
-
- In [1]: pd.Series([1, 2], index=['a', 'a'])
- Out[1]:
- a 1
- a 2
- Length: 2, dtype: int64
-
- In [2]: pd.Series([1, 2], index=['a', 'a']).set_flags(allows_duplicate_labels=False)
- ...
- DuplicateLabelError: Index has duplicates.
- positions
- label
- a [0, 1]
+.. ipython:: python
-pandas will propagate the ``allows_duplicate_labels`` property through many operations.
+ pd.Series([1, 2], index=['a', 'a'])
-.. code-block:: ipython
+.. ipython:: python
+ :okexcept:
- In [3]: a = (
- ...: pd.Series([1, 2], index=['a', 'b'])
- ...: .set_flags(allows_duplicate_labels=False)
- ...: )
+ pd.Series([1, 2], index=['a', 'a']).set_flags(allows_duplicate_labels=False)
- In [4]: a
- Out[4]:
- a 1
- b 2
- Length: 2, dtype: int64
+pandas will propagate the ``allows_duplicate_labels`` property through many operations.
- # An operation introducing duplicates
- In [5]: a.reindex(['a', 'b', 'a'])
- ...
- DuplicateLabelError: Index has duplicates.
- positions
- label
- a [0, 2]
+.. ipython:: python
+ :okexcept:
- [1 rows x 1 columns]
+ a = (
+ pd.Series([1, 2], index=['a', 'b'])
+ .set_flags(allows_duplicate_labels=False)
+ )
+ a
+ # An operation introducing duplicates
+ a.reindex(['a', 'b', 'a'])
.. warning::
@@ -205,9 +178,6 @@ Alternatively, you can also use the dtype object:
pd.Series([1.5, None], dtype=pd.Float32Dtype())
-Operations with the existing integer or boolean nullable data types that
-give float results will now also use the nullable floating data types (:issue:`38178`).
-
.. warning::
Experimental: the new floating data types are currently experimental, and their
@@ -303,10 +273,6 @@ Other enhancements
- Calling a binary-input NumPy ufunc on multiple ``DataFrame`` objects now aligns, matching the behavior of binary operations and ufuncs on ``Series`` (:issue:`23743`).
- Where possible :meth:`RangeIndex.difference` and :meth:`RangeIndex.symmetric_difference` will return :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`36564`)
- :meth:`DataFrame.to_parquet` now supports :class:`MultiIndex` for columns in parquet format (:issue:`34777`)
-- :func:`read_parquet` gained a ``use_nullable_dtypes=True`` option to use
- nullable dtypes that use ``pd.NA`` as missing value indicator where possible
- for the resulting DataFrame (default is False, and only applicable for
- ``engine="pyarrow"``) (:issue:`31242`)
- Added :meth:`.Rolling.sem` and :meth:`Expanding.sem` to compute the standard error of the mean (:issue:`26476`)
- :meth:`.Rolling.var` and :meth:`.Rolling.std` use Kahan summation and Welford's Method to avoid numerical issues (:issue:`37051`)
- :meth:`DataFrame.corr` and :meth:`DataFrame.cov` use Welford's Method to avoid numerical issues (:issue:`37448`)
@@ -323,7 +289,6 @@ Other enhancements
- Improve numerical stability for :meth:`.Rolling.skew`, :meth:`.Rolling.kurt`, :meth:`Expanding.skew` and :meth:`Expanding.kurt` through implementation of Kahan summation (:issue:`6929`)
- Improved error reporting for subsetting columns of a :class:`.DataFrameGroupBy` with ``axis=1`` (:issue:`37725`)
- Implement method ``cross`` for :meth:`DataFrame.merge` and :meth:`DataFrame.join` (:issue:`5401`)
-- When :func:`read_csv/sas/json` are called with ``chuncksize``/``iterator`` they can be used in a ``with`` statement as they return context-managers (:issue:`38225`)
.. ---------------------------------------------------------------------------
@@ -521,7 +486,6 @@ Other API changes
- Passing an invalid ``fill_value`` to :meth:`Series.shift` with a ``CategoricalDtype`` now raises a ``TypeError`` instead of a ``ValueError`` (:issue:`37733`)
- Passing an invalid value to :meth:`IntervalIndex.insert` or :meth:`CategoricalIndex.insert` now raises a ``TypeError`` instead of a ``ValueError`` (:issue:`37733`)
- Attempting to reindex a Series with a :class:`CategoricalIndex` with an invalid ``fill_value`` now raises a ``TypeError`` instead of a ``ValueError`` (:issue:`37733`)
-- :meth:`CategoricalIndex.append` with an index that contains non-category values will now cast instead of raising ``TypeError`` (:issue:`38098`)
.. ---------------------------------------------------------------------------
@@ -581,7 +545,6 @@ Performance improvements
- Performance improvement in :meth:`Series.astype` and :meth:`DataFrame.astype` for :class:`Categorical` (:issue:`8628`)
- Performance improvement in :meth:`DataFrame.groupby` for ``float`` ``dtype`` (:issue:`28303`), changes of the underlying hash-function can lead to changes in float based indexes sort ordering for ties (e.g. :meth:`Index.value_counts`)
- Performance improvement in :meth:`pd.isin` for inputs with more than 1e6 elements (:issue:`36611`)
-- Performance improvement for :meth:`DataFrame.__setitem__` with list-like indexers (:issue:`37954`)
.. ---------------------------------------------------------------------------
@@ -596,11 +559,10 @@ Categorical
- Bug in :meth:`Categorical.__setitem__` that incorrectly raised when trying to set a tuple value (:issue:`20439`)
- Bug in :meth:`CategoricalIndex.equals` incorrectly casting non-category entries to ``np.nan`` (:issue:`37667`)
- Bug in :meth:`CategoricalIndex.where` incorrectly setting non-category entries to ``np.nan`` instead of raising ``TypeError`` (:issue:`37977`)
-- Bug in :meth:`Categorical.to_numpy` and ``np.array(categorical)`` with timezone-aware ``datetime64`` categories incorrectly dropping the timezone information instead of casting to object dtype (:issue:`38136`)
+-
Datetimelike
^^^^^^^^^^^^
-- Bug in :meth:`DataFrame.combine_first` that would convert datetime-like column on other :class:`DataFrame` to integer when the column is not present in original :class:`DataFrame` (:issue:`28481`)
- Bug in :attr:`.DatetimeArray.date` where a ``ValueError`` would be raised with a read-only backing array (:issue:`33530`)
- Bug in ``NaT`` comparisons failing to raise ``TypeError`` on invalid inequality comparisons (:issue:`35046`)
- Bug in :class:`.DateOffset` where attributes reconstructed from pickle files differ from original objects when input values exceed normal ranges (e.g months=12) (:issue:`34511`)
@@ -621,7 +583,6 @@ Datetimelike
- Bug in :meth:`Series.isin` with ``datetime64[ns]`` dtype and :meth:`.DatetimeIndex.isin` incorrectly casting integers to datetimes (:issue:`36621`)
- Bug in :meth:`Series.isin` with ``datetime64[ns]`` dtype and :meth:`.DatetimeIndex.isin` failing to consider timezone-aware and timezone-naive datetimes as always different (:issue:`35728`)
- Bug in :meth:`Series.isin` with ``PeriodDtype`` dtype and :meth:`PeriodIndex.isin` failing to consider arguments with different ``PeriodDtype`` as always different (:issue:`37528`)
-- Bug in :class:`Period` constructor now correctly handles nanoseconds in the ``value`` argument (:issue:`34621` and :issue:`17053`)
Timedelta
^^^^^^^^^
@@ -654,13 +615,11 @@ Numeric
- Bug in :class:`DataFrame` allowing arithmetic operations with list of array-likes with undefined results. Behavior changed to raising ``ValueError`` (:issue:`36702`)
- Bug in :meth:`DataFrame.std` with ``timedelta64`` dtype and ``skipna=False`` (:issue:`37392`)
- Bug in :meth:`DataFrame.min` and :meth:`DataFrame.max` with ``datetime64`` dtype and ``skipna=False`` (:issue:`36907`)
-- Bug in :meth:`DataFrame.idxmax` and :meth:`DataFrame.idxmin` with mixed dtypes incorrectly raising ``TypeError`` (:issue:`38195`)
Conversion
^^^^^^^^^^
- Bug in :meth:`DataFrame.to_dict` with ``orient='records'`` now returns python native datetime objects for datetimelike columns (:issue:`21256`)
-- Bug in :meth:`Series.astype` conversion from ``string`` to ``float`` raised in presence of ``pd.NA`` values (:issue:`37626`)
-
Strings
@@ -692,7 +651,6 @@ Indexing
- Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`MultiIndex` and a level named ``"0"`` (:issue:`37194`)
- Bug in :meth:`Series.__getitem__` when using an unsigned integer array as an indexer giving incorrect results or segfaulting instead of raising ``KeyError`` (:issue:`37218`)
- Bug in :meth:`Index.where` incorrectly casting numeric values to strings (:issue:`37591`)
-- Bug in :meth:`DataFrame.loc` returning empty result when indexer is a slice with negative step size (:issue:`38071`)
- Bug in :meth:`Series.loc` and :meth:`DataFrame.loc` raises when the index was of ``object`` dtype and the given numeric label was in the index (:issue:`26491`)
- Bug in :meth:`DataFrame.loc` returned requested key plus missing values when ``loc`` was applied to single level from a :class:`MultiIndex` (:issue:`27104`)
- Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`CategoricalIndex` using a listlike indexer containing NA values (:issue:`37722`)
@@ -705,20 +663,15 @@ Indexing
- Bug in :meth:`MultiIndex.drop` does not raise if labels are partially found (:issue:`37820`)
- Bug in :meth:`DataFrame.loc` did not raise ``KeyError`` when missing combination was given with ``slice(None)`` for remaining levels (:issue:`19556`)
- Bug in :meth:`DataFrame.loc` raising ``TypeError`` when non-integer slice was given to select values from :class:`MultiIndex` (:issue:`25165`, :issue:`24263`)
-- Bug in :meth:`Series.at` returning :class:`Series` with one element instead of scalar when index is a :class:`MultiIndex` with one level (:issue:`38053`)
- Bug in :meth:`DataFrame.loc` returning and assigning elements in wrong order when indexer is differently ordered than the :class:`MultiIndex` to filter (:issue:`31330`, :issue:`34603`)
- Bug in :meth:`DataFrame.loc` and :meth:`DataFrame.__getitem__` raising ``KeyError`` when columns were :class:`MultiIndex` with only one level (:issue:`29749`)
- Bug in :meth:`Series.__getitem__` and :meth:`DataFrame.__getitem__` raising blank ``KeyError`` without missing keys for :class:`IntervalIndex` (:issue:`27365`)
-- Bug in setting a new label on a :class:`DataFrame` or :class:`Series` with a :class:`CategoricalIndex` incorrectly raising ``TypeError`` when the new label is not among the index's categories (:issue:`38098`)
-- Bug in :meth:`Series.loc` and :meth:`Series.iloc` raising ``ValueError`` when inserting a listlike ``np.array``, ``list`` or ``tuple`` in an ``object`` Series of equal length (:issue:`37748`, :issue:`37486`)
-- Bug in :meth:`Series.loc` and :meth:`Series.iloc` setting all the values of an ``object`` Series with those of a listlike ``ExtensionArray`` instead of inserting it (:issue:`38271`)
Missing
^^^^^^^
- Bug in :meth:`.SeriesGroupBy.transform` now correctly handles missing values for ``dropna=False`` (:issue:`35014`)
- Bug in :meth:`Series.nunique` with ``dropna=True`` was returning incorrect results when both ``NA`` and ``None`` missing values were present (:issue:`37566`)
-- Bug in :meth:`Series.interpolate` where kwarg ``limit_area`` and ``limit_direction`` had no effect when using methods ``pad`` and ``backfill`` (:issue:`31048`)
-
MultiIndex
@@ -728,7 +681,6 @@ MultiIndex
- Bug in :meth:`DataFrame.reset_index` with ``NaT`` values in index raises ``ValueError`` with message ``"cannot convert float NaN to integer"`` (:issue:`36541`)
- Bug in :meth:`DataFrame.combine_first` when used with :class:`MultiIndex` containing string and ``NaN`` values raises ``TypeError`` (:issue:`36562`)
- Bug in :meth:`MultiIndex.drop` dropped ``NaN`` values when non existing key was given as input (:issue:`18853`)
-- Bug in :meth:`MultiIndex.drop` dropping more values than expected when index has duplicates and is not sorted (:issue:`33494`)
I/O
^^^
@@ -806,14 +758,10 @@ Groupby/resample/rolling
- Bug in :meth:`DataFrame.groupby` dropped ``nan`` groups from result with ``dropna=False`` when grouping over a single column (:issue:`35646`, :issue:`35542`)
- Bug in :meth:`.DataFrameGroupBy.head`, :meth:`.DataFrameGroupBy.tail`, :meth:`SeriesGroupBy.head`, and :meth:`SeriesGroupBy.tail` would raise when used with ``axis=1`` (:issue:`9772`)
- Bug in :meth:`.DataFrameGroupBy.transform` would raise when used with ``axis=1`` and a transformation kernel (e.g. "shift") (:issue:`36308`)
-- Bug in :meth:`.DataFrameGroupBy.apply` dropped values on ``nan`` group when returning the same axes with the original frame (:issue:`38227`)
-- Bug in :meth:`.DataFrameGroupBy.quantile` couldn't handle with arraylike ``q`` when grouping by columns (:issue:`33795`)
-- Bug in :meth:`DataFrameGroupBy.rank` with ``datetime64tz`` or period dtype incorrectly casting results to those dtypes instead of returning ``float64`` dtype (:issue:`38187`)
Reshaping
^^^^^^^^^
-- Bug in :meth:`DataFrame.crosstab` was returning incorrect results on inputs with duplicate row names, duplicate column names or duplicate names between row and column labels (:issue:`22529`)
- Bug in :meth:`DataFrame.pivot_table` with ``aggfunc='count'`` or ``aggfunc='sum'`` returning ``NaN`` for missing categories when pivoted on a ``Categorical``. Now returning ``0`` (:issue:`31422`)
- Bug in :func:`concat` and :class:`DataFrame` constructor where input index names are not preserved in some cases (:issue:`13475`)
- Bug in func :meth:`crosstab` when using multiple columns with ``margins=True`` and ``normalize=True`` (:issue:`35144`)
@@ -829,10 +777,6 @@ Reshaping
- Bug in :meth:`DataFrame.apply` not setting index of return value when ``func`` return type is ``dict`` (:issue:`37544`)
- Bug in :func:`concat` resulting in a ``ValueError`` when at least one of both inputs had a non-unique index (:issue:`36263`)
- Bug in :meth:`DataFrame.merge` and :meth:`pandas.merge` returning inconsistent ordering in result for ``how=right`` and ``how=left`` (:issue:`35382`)
-- Bug in :func:`merge_ordered` couldn't handle list-like ``left_by`` or ``right_by`` (:issue:`35269`)
-- Bug in :func:`merge_ordered` returned wrong join result when length of ``left_by`` or ``right_by`` equals to the rows of ``left`` or ``right`` (:issue:`38166`)
-- Bug in :func:`merge_ordered` didn't raise when elements in ``left_by`` or ``right_by`` not exist in ``left`` columns or ``right`` columns (:issue:`38167`)
-- Bug in :func:`DataFrame.drop_duplicates` not validating bool dtype for ``ignore_index`` keyword (:issue:`38274`)
Sparse
^^^^^^
@@ -848,7 +792,6 @@ ExtensionArray
- Fixed bug when applying a NumPy ufunc with multiple outputs to an :class:`.IntegerArray` returning None (:issue:`36913`)
- Fixed an inconsistency in :class:`.PeriodArray`'s ``__init__`` signature to those of :class:`.DatetimeArray` and :class:`.TimedeltaArray` (:issue:`37289`)
- Reductions for :class:`.BooleanArray`, :class:`.Categorical`, :class:`.DatetimeArray`, :class:`.FloatingArray`, :class:`.IntegerArray`, :class:`.PeriodArray`, :class:`.TimedeltaArray`, and :class:`.PandasArray` are now keyword-only methods (:issue:`37541`)
-- Fixed a bug where a ``TypeError`` was wrongly raised if a membership check was made on an ``ExtensionArray`` containing nan-like values (:issue:`37867`)
Other
^^^^^
@@ -860,19 +803,10 @@ Other
- Fixed bug in metadata propagation incorrectly copying DataFrame columns as metadata when the column name overlaps with the metadata name (:issue:`37037`)
- Fixed metadata propagation in the :class:`Series.dt`, :class:`Series.str` accessors, :class:`DataFrame.duplicated`, :class:`DataFrame.stack`, :class:`DataFrame.unstack`, :class:`DataFrame.pivot`, :class:`DataFrame.append`, :class:`DataFrame.diff`, :class:`DataFrame.applymap` and :class:`DataFrame.update` methods (:issue:`28283`, :issue:`37381`)
- Fixed metadata propagation when selecting columns with ``DataFrame.__getitem__`` (:issue:`28283`)
-- Bug in :meth:`Index.intersection` with non-:class:`Index` failing to set the correct name on the returned :class:`Index` (:issue:`38111`)
-- Bug in :meth:`RangeIndex.intersection` failing to set the correct name on the returned :class:`Index` in some corner cases (:issue:`38197`)
-- Bug in :meth:`Index.difference` failing to set the correct name on the returned :class:`Index` in some corner cases (:issue:`38268`)
- Bug in :meth:`Index.union` behaving differently depending on whether operand is an :class:`Index` or other list-like (:issue:`36384`)
-- Bug in :meth:`Index.intersection` with non-matching numeric dtypes casting to ``object`` dtype instead of minimal common dtype (:issue:`38122`)
-- Bug in :meth:`IntervalIndex.intersection` returning an incorrectly-typed :class:`Index` when empty (:issue:`38282`)
- Passing an array with 2 or more dimensions to the :class:`Series` constructor now raises the more specific ``ValueError`` rather than a bare ``Exception`` (:issue:`35744`)
- Bug in ``dir`` where ``dir(obj)`` wouldn't show attributes defined on the instance for pandas objects (:issue:`37173`)
-- Bug in :meth:`Index.drop` raising ``InvalidIndexError`` when index has duplicates (:issue:`38051`)
- Bug in :meth:`RangeIndex.difference` returning :class:`Int64Index` in some cases where it should return :class:`RangeIndex` (:issue:`38028`)
-- Fixed bug in :func:`assert_series_equal` when comparing a datetime-like array with an equivalent non extension dtype array (:issue:`37609`)
-
-
.. ---------------------------------------------------------------------------
@@ -880,5 +814,3 @@ Other
Contributors
~~~~~~~~~~~~
-
-.. contributors:: v1.1.4..v1.2.0|HEAD
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 5c4ba3b2729e3..24156c88f0d76 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -928,7 +928,9 @@ def group_last(rank_t[:, :] out,
for j in range(K):
val = values[i, j]
- if not checknull(val):
+ # None should not be treated like other NA-like
+ # so that it won't be converted to nan
+ if not checknull(val) or val is None:
# NB: use _treat_as_na here once
# conditional-nogil is available.
nobs[lab, j] += 1
@@ -937,7 +939,7 @@ def group_last(rank_t[:, :] out,
for i in range(ncounts):
for j in range(K):
if nobs[i, j] < min_count:
- out[i, j] = None
+ out[i, j] = NAN
else:
out[i, j] = resx[i, j]
else:
@@ -1021,7 +1023,9 @@ def group_nth(rank_t[:, :] out,
for j in range(K):
val = values[i, j]
- if not checknull(val):
+ # None should not be treated like other NA-like
+ # so that it won't be converted to nan
+ if not checknull(val) or val is None:
# NB: use _treat_as_na here once
# conditional-nogil is available.
nobs[lab, j] += 1
@@ -1031,7 +1035,7 @@ def group_nth(rank_t[:, :] out,
for i in range(ncounts):
for j in range(K):
if nobs[i, j] < min_count:
- out[i, j] = None
+ out[i, j] = NAN
else:
out[i, j] = resx[i, j]
diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx
index 4b6b71088cb7c..ad6329c588bbe 100644
--- a/pandas/_libs/reduction.pyx
+++ b/pandas/_libs/reduction.pyx
@@ -367,9 +367,9 @@ def apply_frame_axis0(object frame, object f, object names,
try:
piece = f(chunk)
- except Exception as err:
+ except Exception:
# We can't be more specific without knowing something about `f`
- raise InvalidApply("Let this error raise above us") from err
+ raise InvalidApply('Let this error raise above us')
# Need to infer if low level index slider will cause segfaults
require_slow_apply = i == 0 and piece is chunk
diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx
index cbd4e2e6704a9..d83138528a6f9 100644
--- a/pandas/_libs/tslibs/period.pyx
+++ b/pandas/_libs/tslibs/period.pyx
@@ -2345,7 +2345,6 @@ class Period(_Period):
if freq is not None:
freq = cls._maybe_convert_freq(freq)
- nanosecond = 0
if ordinal is not None and value is not None:
raise ValueError("Only value or ordinal but not both should be "
@@ -2395,14 +2394,6 @@ class Period(_Period):
value = str(value)
value = value.upper()
dt, reso = parse_time_string(value, freq)
- try:
- ts = Timestamp(value)
- except ValueError:
- nanosecond = 0
- else:
- nanosecond = ts.nanosecond
- if nanosecond != 0:
- reso = 'nanosecond'
if dt is NaT:
ordinal = NPY_NAT
@@ -2434,7 +2425,7 @@ class Period(_Period):
base = freq_to_dtype_code(freq)
ordinal = period_ordinal(dt.year, dt.month, dt.day,
dt.hour, dt.minute, dt.second,
- dt.microsecond, 1000*nanosecond, base)
+ dt.microsecond, 0, base)
return cls._from_ordinal(ordinal, freq)
diff --git a/pandas/_testing.py b/pandas/_testing.py
index 469f5e1bed6ba..68371b782aac2 100644
--- a/pandas/_testing.py
+++ b/pandas/_testing.py
@@ -1456,16 +1456,7 @@ def assert_series_equal(
check_dtype=check_dtype,
index_values=np.asarray(left.index),
)
- elif is_extension_array_dtype_and_needs_i8_conversion(
- left.dtype, right.dtype
- ) or is_extension_array_dtype_and_needs_i8_conversion(right.dtype, left.dtype):
- assert_extension_array_equal(
- left._values,
- right._values,
- check_dtype=check_dtype,
- index_values=np.asarray(left.index),
- )
- elif needs_i8_conversion(left.dtype) and needs_i8_conversion(right.dtype):
+ elif needs_i8_conversion(left.dtype) or needs_i8_conversion(right.dtype):
# DatetimeArray or TimedeltaArray
assert_extension_array_equal(
left._values,
@@ -1875,20 +1866,6 @@ def assert_copy(iter1, iter2, **eql_kwargs):
assert elem1 is not elem2, msg
-def is_extension_array_dtype_and_needs_i8_conversion(left_dtype, right_dtype) -> bool:
- """
- Checks that we have the combination of an ExtensionArraydtype and
- a dtype that should be converted to int64
-
- Returns
- -------
- bool
-
- Related to issue #37609
- """
- return is_extension_array_dtype(left_dtype) and needs_i8_conversion(right_dtype)
-
-
def getCols(k):
return string.ascii_uppercase[:k]
@@ -2190,15 +2167,15 @@ def makeCustomIndex(
names = [names]
# specific 1D index type requested?
- idx_func = {
- "i": makeIntIndex,
- "f": makeFloatIndex,
- "s": makeStringIndex,
- "u": makeUnicodeIndex,
- "dt": makeDateIndex,
- "td": makeTimedeltaIndex,
- "p": makePeriodIndex,
- }.get(idx_type)
+ idx_func = dict(
+ i=makeIntIndex,
+ f=makeFloatIndex,
+ s=makeStringIndex,
+ u=makeUnicodeIndex,
+ dt=makeDateIndex,
+ td=makeTimedeltaIndex,
+ p=makePeriodIndex,
+ ).get(idx_type)
if idx_func:
# pandas\_testing.py:2120: error: Cannot call function of unknown type
idx = idx_func(nentries) # type: ignore[operator]
diff --git a/pandas/_typing.py b/pandas/_typing.py
index 09c490e64957d..7f01bcaa1c50e 100644
--- a/pandas/_typing.py
+++ b/pandas/_typing.py
@@ -1,7 +1,7 @@
from datetime import datetime, timedelta, tzinfo
from io import BufferedIOBase, RawIOBase, TextIOBase, TextIOWrapper
from mmap import mmap
-from os import PathLike
+from pathlib import Path
from typing import (
IO,
TYPE_CHECKING,
@@ -135,7 +135,7 @@
# filenames and file-like-objects
Buffer = Union[IO[AnyStr], RawIOBase, BufferedIOBase, TextIOBase, TextIOWrapper, mmap]
FileOrBuffer = Union[str, Buffer[T]]
-FilePathOrBuffer = Union["PathLike[str]", FileOrBuffer[T]]
+FilePathOrBuffer = Union[Path, FileOrBuffer[T]]
# for arbitrary kwargs passed during reading/writing files
StorageOptions = Optional[Dict[str, Any]]
diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py
index c47c31fabeb70..c2e91c7877d35 100644
--- a/pandas/compat/numpy/function.py
+++ b/pandas/compat/numpy/function.py
@@ -71,7 +71,7 @@ def __call__(
raise ValueError(f"invalid validation method '{method}'")
-ARGMINMAX_DEFAULTS = {"out": None}
+ARGMINMAX_DEFAULTS = dict(out=None)
validate_argmin = CompatValidator(
ARGMINMAX_DEFAULTS, fname="argmin", method="both", max_fname_arg_count=1
)
@@ -151,7 +151,7 @@ def validate_argsort_with_ascending(ascending, args, kwargs):
return ascending
-CLIP_DEFAULTS: Dict[str, Any] = {"out": None}
+CLIP_DEFAULTS: Dict[str, Any] = dict(out=None)
validate_clip = CompatValidator(
CLIP_DEFAULTS, fname="clip", method="both", max_fname_arg_count=3
)
@@ -208,10 +208,10 @@ def validate_cum_func_with_skipna(skipna, args, kwargs, name):
ALLANY_DEFAULTS, fname="any", method="both", max_fname_arg_count=1
)
-LOGICAL_FUNC_DEFAULTS = {"out": None, "keepdims": False}
+LOGICAL_FUNC_DEFAULTS = dict(out=None, keepdims=False)
validate_logical_func = CompatValidator(LOGICAL_FUNC_DEFAULTS, method="kwargs")
-MINMAX_DEFAULTS = {"axis": None, "out": None, "keepdims": False}
+MINMAX_DEFAULTS = dict(axis=None, out=None, keepdims=False)
validate_min = CompatValidator(
MINMAX_DEFAULTS, fname="min", method="both", max_fname_arg_count=1
)
@@ -219,17 +219,17 @@ def validate_cum_func_with_skipna(skipna, args, kwargs, name):
MINMAX_DEFAULTS, fname="max", method="both", max_fname_arg_count=1
)
-RESHAPE_DEFAULTS: Dict[str, str] = {"order": "C"}
+RESHAPE_DEFAULTS: Dict[str, str] = dict(order="C")
validate_reshape = CompatValidator(
RESHAPE_DEFAULTS, fname="reshape", method="both", max_fname_arg_count=1
)
-REPEAT_DEFAULTS: Dict[str, Any] = {"axis": None}
+REPEAT_DEFAULTS: Dict[str, Any] = dict(axis=None)
validate_repeat = CompatValidator(
REPEAT_DEFAULTS, fname="repeat", method="both", max_fname_arg_count=1
)
-ROUND_DEFAULTS: Dict[str, Any] = {"out": None}
+ROUND_DEFAULTS: Dict[str, Any] = dict(out=None)
validate_round = CompatValidator(
ROUND_DEFAULTS, fname="round", method="both", max_fname_arg_count=1
)
@@ -300,7 +300,7 @@ def validate_take_with_convert(convert, args, kwargs):
return convert
-TRANSPOSE_DEFAULTS = {"axes": None}
+TRANSPOSE_DEFAULTS = dict(axes=None)
validate_transpose = CompatValidator(
TRANSPOSE_DEFAULTS, fname="transpose", method="both", max_fname_arg_count=0
)
diff --git a/pandas/conftest.py b/pandas/conftest.py
index 2bac2ed198789..a0ec6f96042fc 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -288,6 +288,7 @@ def unique_nulls_fixture(request):
# Generate cartesian product of unique_nulls_fixture:
unique_nulls_fixture2 = unique_nulls_fixture
+
# ----------------------------------------------------------------
# Classes
# ----------------------------------------------------------------
@@ -320,16 +321,6 @@ def index_or_series(request):
index_or_series2 = index_or_series
-@pytest.fixture(
- params=[pd.Index, pd.Series, pd.array], ids=["index", "series", "array"]
-)
-def index_or_series_or_array(request):
- """
- Fixture to parametrize over Index, Series, and ExtensionArray
- """
- return request.param
-
-
@pytest.fixture
def dict_subclass():
"""
@@ -1100,20 +1091,6 @@ def float_ea_dtype(request):
return request.param
-@pytest.fixture(params=tm.FLOAT_DTYPES + tm.FLOAT_EA_DTYPES)
-def any_float_allowed_nullable_dtype(request):
- """
- Parameterized fixture for float dtypes.
-
- * float
- * 'float32'
- * 'float64'
- * 'Float32'
- * 'Float64'
- """
- return request.param
-
-
@pytest.fixture(params=tm.COMPLEX_DTYPES)
def complex_dtype(request):
"""
@@ -1432,17 +1409,3 @@ def __init__(self, **kwargs):
registry.pop("testmem", None)
TestMemoryFS.test[0] = None
TestMemoryFS.store.clear()
-
-
-@pytest.fixture(
- params=[
- ("foo", None, None),
- ("Egon", "Venkman", None),
- ("NCC1701D", "NCC1701D", "NCC1701D"),
- ]
-)
-def names(request):
- """
- A 3-tuple of names, the first two for operands, the last for a result.
- """
- return request.param
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 9749297efd004..7bae912a070a9 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -48,13 +48,11 @@
pandas_dtype,
)
from pandas.core.dtypes.generic import (
- ABCDatetimeArray,
ABCExtensionArray,
ABCIndexClass,
ABCMultiIndex,
ABCRangeIndex,
ABCSeries,
- ABCTimedeltaArray,
)
from pandas.core.dtypes.missing import isna, na_value_for_dtype
@@ -170,7 +168,6 @@ def _ensure_data(
elif is_categorical_dtype(values.dtype) and (
is_categorical_dtype(dtype) or dtype is None
):
- values = cast("Categorical", values)
values = values.codes
dtype = pandas_dtype("category")
@@ -201,16 +198,8 @@ def _reconstruct_data(
-------
ExtensionArray or np.ndarray
"""
- if isinstance(values, ABCExtensionArray) and values.dtype == dtype:
- # Catch DatetimeArray/TimedeltaArray
- return values
-
if is_extension_array_dtype(dtype):
- cls = dtype.construct_array_type()
- if isinstance(values, cls) and values.dtype == dtype:
- return values
-
- values = cls._from_sequence(values)
+ values = dtype.construct_array_type()._from_sequence(values)
elif is_bool_dtype(dtype):
values = values.astype(dtype, copy=False)
@@ -445,8 +434,6 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray:
elif isinstance(values, ABCMultiIndex):
# Avoid raising in extract_array
values = np.array(values)
- else:
- values = extract_array(values, extract_numpy=True)
comps = _ensure_arraylike(comps)
comps = extract_array(comps, extract_numpy=True)
@@ -461,14 +448,11 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray:
elif needs_i8_conversion(values.dtype) and not is_object_dtype(comps.dtype):
# e.g. comps are integers and values are datetime64s
return np.zeros(comps.shape, dtype=bool)
- # TODO: not quite right ... Sparse/Categorical
- elif needs_i8_conversion(values.dtype):
- return isin(comps, values.astype(object))
- elif is_extension_array_dtype(comps.dtype) or is_extension_array_dtype(
- values.dtype
- ):
- return isin(np.asarray(comps), np.asarray(values))
+ comps, dtype = _ensure_data(comps)
+ values, _ = _ensure_data(values, dtype=dtype)
+
+ f = htable.ismember_object
# GH16012
# Ensure np.in1d doesn't get object types or it *may* throw an exception
@@ -481,15 +465,23 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray:
f = lambda c, v: np.logical_or(np.in1d(c, v), np.isnan(c))
else:
f = np.in1d
-
- else:
- common = np.find_common_type([values.dtype, comps.dtype], [])
- values = values.astype(common, copy=False)
- comps = comps.astype(common, copy=False)
- name = common.name
- if name == "bool":
- name = "uint8"
- f = getattr(htable, f"ismember_{name}")
+ elif is_integer_dtype(comps.dtype):
+ try:
+ values = values.astype("int64", copy=False)
+ comps = comps.astype("int64", copy=False)
+ f = htable.ismember_int64
+ except (TypeError, ValueError, OverflowError):
+ values = values.astype(object)
+ comps = comps.astype(object)
+
+ elif is_float_dtype(comps.dtype):
+ try:
+ values = values.astype("float64", copy=False)
+ comps = comps.astype("float64", copy=False)
+ f = htable.ismember_float64
+ except (TypeError, ValueError):
+ values = values.astype(object)
+ comps = comps.astype(object)
return f(comps, values)
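
Note: the isin fallback restored in the hunk above picks a typed membership routine from the comps dtype and degrades to an object-level check on overflow or mixed types. Below is a minimal standalone sketch of that dispatch idea using only numpy; the helper name and structure are illustrative, not pandas internals.

    import numpy as np

    def isin_sketch(comps: np.ndarray, values: np.ndarray) -> np.ndarray:
        # Fast integer path; fall back if values do not fit int64
        if np.issubdtype(comps.dtype, np.integer):
            try:
                return np.in1d(comps.astype("int64"), values.astype("int64"))
            except (TypeError, ValueError, OverflowError):
                pass
        # Fast float path; NaN needs explicit handling because NaN != NaN
        if np.issubdtype(comps.dtype, np.floating):
            try:
                c = comps.astype("float64")
                v = values.astype("float64")
                out = np.in1d(c, v)
                if np.isnan(v).any():
                    out |= np.isnan(c)
                return out
            except (TypeError, ValueError):
                pass
        # Object fallback: plain Python membership test
        lookup = set(values.tolist())
        return np.array([x in lookup for x in comps.tolist()], dtype=bool)
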
@@ -681,13 +673,8 @@ def factorize(
# responsible only for factorization. All data coercion, sorting and boxing
# should happen here.
- if isinstance(values, ABCRangeIndex):
- return values.factorize(sort=sort)
-
values = _ensure_arraylike(values)
original = values
- if not isinstance(values, ABCMultiIndex):
- values = extract_array(values, extract_numpy=True)
# GH35667, if na_sentinel=None, we will not dropna NaNs from the uniques
# of values, assign na_sentinel=-1 to replace code value for NaN.
@@ -696,20 +683,10 @@ def factorize(
na_sentinel = -1
dropna = False
- if (
- isinstance(values, (ABCDatetimeArray, ABCTimedeltaArray))
- and values.freq is not None
- ):
- codes, uniques = values.factorize(sort=sort)
- if isinstance(original, ABCIndexClass):
- uniques = original._shallow_copy(uniques, name=None)
- elif isinstance(original, ABCSeries):
- from pandas import Index
-
- uniques = Index(uniques)
- return codes, uniques
-
- if is_extension_array_dtype(values.dtype):
+ if isinstance(values, ABCRangeIndex):
+ return values.factorize(sort=sort)
+ elif is_extension_array_dtype(values.dtype):
+ values = extract_array(values)
codes, uniques = values.factorize(na_sentinel=na_sentinel)
dtype = original.dtype
else:
diff --git a/pandas/core/apply.py b/pandas/core/apply.py
index 6d9e11ecb824f..c5260deafc0c3 100644
--- a/pandas/core/apply.py
+++ b/pandas/core/apply.py
@@ -9,12 +9,7 @@
from pandas._typing import Axis, FrameOrSeriesUnion
from pandas.util._decorators import cache_readonly
-from pandas.core.dtypes.common import (
- is_dict_like,
- is_extension_array_dtype,
- is_list_like,
- is_sequence,
-)
+from pandas.core.dtypes.common import is_dict_like, is_list_like, is_sequence
from pandas.core.dtypes.generic import ABCSeries
from pandas.core.construction import create_series_with_explicit_dtype
@@ -397,20 +392,12 @@ def series_generator(self):
mgr = ser._mgr
blk = mgr.blocks[0]
- if is_extension_array_dtype(blk.dtype):
- # values will be incorrect for this block
- # TODO(EA2D): special case would be unnecessary with 2D EAs
- obj = self.obj
- for i in range(len(obj)):
- yield obj._ixs(i, axis=0)
-
- else:
- for (arr, name) in zip(values, self.index):
- # GH#35462 re-pin mgr in case setitem changed it
- ser._mgr = mgr
- blk.values = arr
- ser.name = name
- yield ser
+ for (arr, name) in zip(values, self.index):
+ # GH#35462 re-pin mgr in case setitem changed it
+ ser._mgr = mgr
+ blk.values = arr
+ ser.name = name
+ yield ser
@property
def result_index(self) -> "Index":
diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py
index 02214ff51b02a..5cc6525dc3c9b 100644
--- a/pandas/core/arrays/_mixins.py
+++ b/pandas/core/arrays/_mixins.py
@@ -162,7 +162,7 @@ def repeat(
--------
numpy.ndarray.repeat
"""
- nv.validate_repeat((), {"axis": axis})
+ nv.validate_repeat(tuple(), dict(axis=axis))
new_data = self._ndarray.repeat(repeats, axis=axis)
return self._from_backing_data(new_data)
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index 77cd603cc6b8d..448025e05422d 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -37,7 +37,6 @@
is_array_like,
is_dtype_equal,
is_list_like,
- is_scalar,
pandas_dtype,
)
from pandas.core.dtypes.dtypes import ExtensionDtype
@@ -49,7 +48,7 @@
from pandas.core.missing import get_fill_func
from pandas.core.sorting import nargminmax, nargsort
-_extension_array_shared_docs: Dict[str, str] = {}
+_extension_array_shared_docs: Dict[str, str] = dict()
ExtensionArrayT = TypeVar("ExtensionArrayT", bound="ExtensionArray")
@@ -355,23 +354,6 @@ def __iter__(self):
for i in range(len(self)):
yield self[i]
- def __contains__(self, item) -> bool:
- """
- Return for `item in self`.
- """
- # GH37867
- # comparisons of any item to pd.NA always return pd.NA, so e.g. "a" in [pd.NA]
- # would raise a TypeError. The implementation below works around that.
- if is_scalar(item) and isna(item):
- if not self._can_hold_na:
- return False
- elif item is self.dtype.na_value or isinstance(item, self.dtype.type):
- return self.isna().any()
- else:
- return False
- else:
- return (item == self).any()
-
def __eq__(self, other: Any) -> ArrayLike:
"""
Return for `self == other` (element-wise equality).
@@ -952,7 +934,7 @@ def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ExtensionArray]:
@Substitution(klass="ExtensionArray")
@Appender(_extension_array_shared_docs["repeat"])
def repeat(self, repeats, axis=None):
- nv.validate_repeat(tuple(), {"axis": axis})
+ nv.validate_repeat(tuple(), dict(axis=axis))
ind = np.arange(len(self)).repeat(repeats)
return self.take(ind)
diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py
index 44cc108ed9cfd..c6c7396a980b0 100644
--- a/pandas/core/arrays/boolean.py
+++ b/pandas/core/arrays/boolean.py
@@ -706,11 +706,10 @@ def _maybe_mask_result(self, result, mask, other, op_name: str):
if (is_float_dtype(other) or is_float(other)) or (
op_name in ["rtruediv", "truediv"]
):
- from pandas.core.arrays import FloatingArray
-
- return FloatingArray(result, mask, copy=False)
+ result[mask] = np.nan
+ return result
- elif is_bool_dtype(result):
+ if is_bool_dtype(result):
return BooleanArray(result, mask, copy=False)
elif is_integer_dtype(result):
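
As a side note, the float branch of _maybe_mask_result restored above returns a plain ndarray with masked slots set to NaN instead of a FloatingArray. A tiny standalone illustration of that fill step (toy data, not the pandas method itself):

    import numpy as np

    result = np.array([0.5, 2.0, 1.5])
    mask = np.array([False, True, False])
    result[mask] = np.nan        # masked positions become NaN in the plain ndarray
    print(result)                # [0.5 nan 1.5]
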
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 3995e7b251184..fe66aae23f510 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -1269,13 +1269,15 @@ def __array__(self, dtype=None) -> np.ndarray:
if dtype==None (default), the same dtype as
categorical.categories.dtype.
"""
- ret = take_1d(self.categories._values, self._codes)
+ ret = take_1d(self.categories.values, self._codes)
if dtype and not is_dtype_equal(dtype, self.categories.dtype):
return np.asarray(ret, dtype)
- # When we're a Categorical[ExtensionArray], like Interval,
- # we need to ensure __array__ gets all the way to an
- # ndarray.
- return np.asarray(ret)
+ if is_extension_array_dtype(ret):
+ # When we're a Categorical[ExtensionArray], like Interval,
+ # we need to ensure __array__ gets all the way to an
+ # ndarray.
+ ret = np.asarray(ret)
+ return ret
def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
# for binary ops, use our custom dunder methods
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
index 66906f8463336..8fa2c734092f4 100644
--- a/pandas/core/arrays/datetimelike.py
+++ b/pandas/core/arrays/datetimelike.py
@@ -936,8 +936,7 @@ def _cmp_method(self, other, op):
return result
other_vals = self._unbox(other)
- # GH#37462 comparison on i8 values is almost 2x faster than M8/m8
- result = op(self._ndarray.view("i8"), other_vals.view("i8"))
+ result = op(self._ndarray, other_vals)
o_mask = isna(other)
if self._hasnans | np.any(o_mask):
@@ -1646,24 +1645,6 @@ def _with_freq(self, freq):
arr._freq = freq
return arr
- # --------------------------------------------------------------
-
- def factorize(self, na_sentinel=-1, sort: bool = False):
- if self.freq is not None:
- # We must be unique, so can short-circuit (and retain freq)
- codes = np.arange(len(self), dtype=np.intp)
- uniques = self.copy() # TODO: copy or view?
- if sort and self.freq.n < 0:
- codes = codes[::-1]
- # TODO: overload __getitem__, a slice indexer returns same type as self
- # error: Incompatible types in assignment (expression has type
- # "Union[DatetimeLikeArrayMixin, Union[Any, Any]]", variable
- # has type "TimelikeOps") [assignment]
- uniques = uniques[::-1] # type: ignore[assignment]
- return codes, uniques
- # FIXME: shouldn't get here; we are ignoring sort
- return super().factorize(na_sentinel=na_sentinel)
-
# -------------------------------------------------------------------
# Shared Constructor Helpers
diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py
index 1077538f6a21d..4aed39d7edb92 100644
--- a/pandas/core/arrays/floating.py
+++ b/pandas/core/arrays/floating.py
@@ -13,7 +13,9 @@
from pandas.core.dtypes.common import (
is_bool_dtype,
is_datetime64_dtype,
+ is_float,
is_float_dtype,
+ is_integer,
is_integer_dtype,
is_list_like,
is_object_dtype,
@@ -26,8 +28,7 @@
from pandas.core.ops import invalid_comparison
from pandas.core.tools.numeric import to_numeric
-from .masked import BaseMaskedDtype
-from .numeric import NumericArray
+from .masked import BaseMaskedArray, BaseMaskedDtype
if TYPE_CHECKING:
import pyarrow
@@ -198,7 +199,7 @@ def coerce_to_array(
return values, mask
-class FloatingArray(NumericArray):
+class FloatingArray(BaseMaskedArray):
"""
Array of floating (optional missing) values.
@@ -385,9 +386,9 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike:
# coerce
if is_float_dtype(dtype):
# In astype, we consider dtype=float to also mean na_value=np.nan
- kwargs = {"na_value": np.nan}
+ kwargs = dict(na_value=np.nan)
elif is_datetime64_dtype(dtype):
- kwargs = {"na_value": np.datetime64("NaT")}
+ kwargs = dict(na_value=np.datetime64("NaT"))
else:
kwargs = {}
@@ -477,6 +478,71 @@ def _maybe_mask_result(self, result, mask, other, op_name: str):
return type(self)(result, mask, copy=False)
+ def _arith_method(self, other, op):
+ from pandas.arrays import IntegerArray
+
+ omask = None
+
+ if getattr(other, "ndim", 0) > 1:
+ raise NotImplementedError("can only perform ops with 1-d structures")
+
+ if isinstance(other, (IntegerArray, FloatingArray)):
+ other, omask = other._data, other._mask
+
+ elif is_list_like(other):
+ other = np.asarray(other)
+ if other.ndim > 1:
+ raise NotImplementedError("can only perform ops with 1-d structures")
+ if len(self) != len(other):
+ raise ValueError("Lengths must match")
+ if not (is_float_dtype(other) or is_integer_dtype(other)):
+ raise TypeError("can only perform ops with numeric values")
+
+ else:
+ if not (is_float(other) or is_integer(other) or other is libmissing.NA):
+ raise TypeError("can only perform ops with numeric values")
+
+ if omask is None:
+ mask = self._mask.copy()
+ if other is libmissing.NA:
+ mask |= True
+ else:
+ mask = self._mask | omask
+
+ if op.__name__ == "pow":
+ # 1 ** x is 1.
+ mask = np.where((self._data == 1) & ~self._mask, False, mask)
+ # x ** 0 is 1.
+ if omask is not None:
+ mask = np.where((other == 0) & ~omask, False, mask)
+ elif other is not libmissing.NA:
+ mask = np.where(other == 0, False, mask)
+
+ elif op.__name__ == "rpow":
+ # 1 ** x is 1.
+ if omask is not None:
+ mask = np.where((other == 1) & ~omask, False, mask)
+ elif other is not libmissing.NA:
+ mask = np.where(other == 1, False, mask)
+ # x ** 0 is 1.
+ mask = np.where((self._data == 0) & ~self._mask, False, mask)
+
+ if other is libmissing.NA:
+ result = np.ones_like(self._data)
+ else:
+ with np.errstate(all="ignore"):
+ result = op(self._data, other)
+
+ # divmod returns a tuple
+ if op.__name__ == "divmod":
+ div, mod = result
+ return (
+ self._maybe_mask_result(div, mask, other, "floordiv"),
+ self._maybe_mask_result(mod, mask, other, "mod"),
+ )
+
+ return self._maybe_mask_result(result, mask, other, op.__name__)
+
_dtype_docstring = """
An ExtensionDtype for {dtype} data.
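
The _arith_method added above special-cases pow/rpow so that a known base of 1 or a known exponent of 0 stays unmasked even when the other operand is missing. A small standalone check of that mask rule, using plain numpy arrays rather than the masked array classes:

    import numpy as np

    data = np.array([1.0, 2.0, 3.0])
    mask = np.array([False, False, True])   # third base is missing
    other = np.array([0.0, 0.0, 0.0])       # exponent is 0 everywhere, not masked

    result_mask = mask.copy()
    # 1 ** x is 1, so an unmasked base of 1 never yields a masked result
    result_mask = np.where((data == 1) & ~mask, False, result_mask)
    # x ** 0 is 1, so an unmasked exponent of 0 never yields a masked result
    result_mask = np.where(other == 0, False, result_mask)

    print(np.power(data, other))   # [1. 1. 1.]
    print(result_mask)             # [False False False]
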
diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
index fa427e94fe08f..2897c18acfb09 100644
--- a/pandas/core/arrays/integer.py
+++ b/pandas/core/arrays/integer.py
@@ -1,10 +1,11 @@
+from datetime import timedelta
import numbers
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Type, Union
import warnings
import numpy as np
-from pandas._libs import iNaT, lib, missing as libmissing
+from pandas._libs import Timedelta, iNaT, lib, missing as libmissing
from pandas._typing import ArrayLike, DtypeObj
from pandas.compat.numpy import function as nv
from pandas.util._decorators import cache_readonly
@@ -15,6 +16,7 @@
is_datetime64_dtype,
is_float,
is_float_dtype,
+ is_integer,
is_integer_dtype,
is_list_like,
is_object_dtype,
@@ -27,7 +29,6 @@
from pandas.core.tools.numeric import to_numeric
from .masked import BaseMaskedArray, BaseMaskedDtype
-from .numeric import NumericArray
if TYPE_CHECKING:
import pyarrow
@@ -262,7 +263,7 @@ def coerce_to_array(
return values, mask
-class IntegerArray(NumericArray):
+class IntegerArray(BaseMaskedArray):
"""
Array of integer (optional missing) values.
@@ -493,7 +494,7 @@ def _values_for_argsort(self) -> np.ndarray:
return data
def _cmp_method(self, other, op):
- from pandas.core.arrays import BooleanArray
+ from pandas.core.arrays import BaseMaskedArray, BooleanArray
mask = None
@@ -537,6 +538,73 @@ def _cmp_method(self, other, op):
return BooleanArray(result, mask)
+ def _arith_method(self, other, op):
+ op_name = op.__name__
+ omask = None
+
+ if getattr(other, "ndim", 0) > 1:
+ raise NotImplementedError("can only perform ops with 1-d structures")
+
+ if isinstance(other, IntegerArray):
+ other, omask = other._data, other._mask
+
+ elif is_list_like(other):
+ other = np.asarray(other)
+ if other.ndim > 1:
+ raise NotImplementedError("can only perform ops with 1-d structures")
+ if len(self) != len(other):
+ raise ValueError("Lengths must match")
+ if not (is_float_dtype(other) or is_integer_dtype(other)):
+ raise TypeError("can only perform ops with numeric values")
+
+ elif isinstance(other, (timedelta, np.timedelta64)):
+ other = Timedelta(other)
+
+ else:
+ if not (is_float(other) or is_integer(other) or other is libmissing.NA):
+ raise TypeError("can only perform ops with numeric values")
+
+ if omask is None:
+ mask = self._mask.copy()
+ if other is libmissing.NA:
+ mask |= True
+ else:
+ mask = self._mask | omask
+
+ if op_name == "pow":
+ # 1 ** x is 1.
+ mask = np.where((self._data == 1) & ~self._mask, False, mask)
+ # x ** 0 is 1.
+ if omask is not None:
+ mask = np.where((other == 0) & ~omask, False, mask)
+ elif other is not libmissing.NA:
+ mask = np.where(other == 0, False, mask)
+
+ elif op_name == "rpow":
+ # 1 ** x is 1.
+ if omask is not None:
+ mask = np.where((other == 1) & ~omask, False, mask)
+ elif other is not libmissing.NA:
+ mask = np.where(other == 1, False, mask)
+ # x ** 0 is 1.
+ mask = np.where((self._data == 0) & ~self._mask, False, mask)
+
+ if other is libmissing.NA:
+ result = np.ones_like(self._data)
+ else:
+ with np.errstate(all="ignore"):
+ result = op(self._data, other)
+
+ # divmod returns a tuple
+ if op_name == "divmod":
+ div, mod = result
+ return (
+ self._maybe_mask_result(div, mask, other, "floordiv"),
+ self._maybe_mask_result(mod, mask, other, "mod"),
+ )
+
+ return self._maybe_mask_result(result, mask, other, op_name)
+
def sum(self, *, skipna=True, min_count=0, **kwargs):
nv.validate_sum((), kwargs)
return super()._reduce("sum", skipna=skipna, min_count=min_count)
@@ -568,9 +636,8 @@ def _maybe_mask_result(self, result, mask, other, op_name: str):
if (is_float_dtype(other) or is_float(other)) or (
op_name in ["rtruediv", "truediv"]
):
- from pandas.core.arrays import FloatingArray
-
- return FloatingArray(result, mask, copy=False)
+ result[mask] = np.nan
+ return result
if result.dtype == "timedelta64[ns]":
from pandas.core.arrays import TimedeltaArray
diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py
index 53a98fc43becc..efb66c9a47a97 100644
--- a/pandas/core/arrays/interval.py
+++ b/pandas/core/arrays/interval.py
@@ -44,11 +44,7 @@
from pandas.core.arrays.base import ExtensionArray, _extension_array_shared_docs
from pandas.core.arrays.categorical import Categorical
import pandas.core.common as com
-from pandas.core.construction import (
- array,
- ensure_wrapped_if_datetimelike,
- extract_array,
-)
+from pandas.core.construction import array, extract_array
from pandas.core.indexers import check_array_indexer
from pandas.core.indexes.base import ensure_index
from pandas.core.ops import invalid_comparison, unpack_zerodim_and_defer
@@ -57,11 +53,9 @@
_interval_shared_docs = {}
-_shared_docs_kwargs = {
- "klass": "IntervalArray",
- "qualname": "arrays.IntervalArray",
- "name": "",
-}
+_shared_docs_kwargs = dict(
+ klass="IntervalArray", qualname="arrays.IntervalArray", name=""
+)
_interval_shared_docs[
@@ -129,14 +123,14 @@
@Appender(
_interval_shared_docs["class"]
- % {
- "klass": "IntervalArray",
- "summary": "Pandas array for interval data that are closed on the same side.",
- "versionadded": "0.24.0",
- "name": "",
- "extra_attributes": "",
- "extra_methods": "",
- "examples": textwrap.dedent(
+ % dict(
+ klass="IntervalArray",
+ summary="Pandas array for interval data that are closed on the same side.",
+ versionadded="0.24.0",
+ name="",
+ extra_attributes="",
+ extra_methods="",
+ examples=textwrap.dedent(
"""\
Examples
--------
@@ -153,7 +147,7 @@
:meth:`IntervalArray.from_breaks`, and :meth:`IntervalArray.from_tuples`.
"""
),
- }
+ )
)
class IntervalArray(IntervalMixin, ExtensionArray):
ndim = 1
@@ -257,9 +251,11 @@ def _simple_new(
raise ValueError(msg)
# For dt64/td64 we want DatetimeArray/TimedeltaArray instead of ndarray
- left = ensure_wrapped_if_datetimelike(left)
+ from pandas.core.ops.array_ops import maybe_upcast_datetimelike_array
+
+ left = maybe_upcast_datetimelike_array(left)
left = extract_array(left, extract_numpy=True)
- right = ensure_wrapped_if_datetimelike(right)
+ right = maybe_upcast_datetimelike_array(right)
right = extract_array(right, extract_numpy=True)
lbase = getattr(left, "_ndarray", left).base
@@ -321,9 +317,9 @@ def _from_factorized(cls, values, original):
@classmethod
@Appender(
_interval_shared_docs["from_breaks"]
- % {
- "klass": "IntervalArray",
- "examples": textwrap.dedent(
+ % dict(
+ klass="IntervalArray",
+ examples=textwrap.dedent(
"""\
Examples
--------
@@ -333,7 +329,7 @@ def _from_factorized(cls, values, original):
Length: 3, closed: right, dtype: interval[int64]
"""
),
- }
+ )
)
def from_breaks(cls, breaks, closed="right", copy=False, dtype=None):
breaks = maybe_convert_platform_interval(breaks)
@@ -392,9 +388,9 @@ def from_breaks(cls, breaks, closed="right", copy=False, dtype=None):
@classmethod
@Appender(
_interval_shared_docs["from_arrays"]
- % {
- "klass": "IntervalArray",
- "examples": textwrap.dedent(
+ % dict(
+ klass="IntervalArray",
+ examples=textwrap.dedent(
"""\
>>> pd.arrays.IntervalArray.from_arrays([0, 1, 2], [1, 2, 3])
@@ -402,7 +398,7 @@ def from_breaks(cls, breaks, closed="right", copy=False, dtype=None):
Length: 3, closed: right, dtype: interval[int64]
"""
),
- }
+ )
)
def from_arrays(cls, left, right, closed="right", copy=False, dtype=None):
left = maybe_convert_platform_interval(left)
@@ -447,9 +443,9 @@ def from_arrays(cls, left, right, closed="right", copy=False, dtype=None):
@classmethod
@Appender(
_interval_shared_docs["from_tuples"]
- % {
- "klass": "IntervalArray",
- "examples": textwrap.dedent(
+ % dict(
+ klass="IntervalArray",
+ examples=textwrap.dedent(
"""\
Examples
--------
@@ -459,7 +455,7 @@ def from_arrays(cls, left, right, closed="right", copy=False, dtype=None):
Length: 2, closed: right, dtype: interval[int64]
"""
),
- }
+ )
)
def from_tuples(cls, data, closed="right", copy=False, dtype=None):
if len(data):
@@ -906,7 +902,7 @@ def take(self, indices, *, allow_fill=False, fill_value=None, axis=None, **kwarg
When `indices` contains negative values other than ``-1``
and `allow_fill` is True.
"""
- nv.validate_take((), kwargs)
+ nv.validate_take(tuple(), kwargs)
fill_left = fill_right = fill_value
if allow_fill:
@@ -1146,9 +1142,9 @@ def mid(self):
@Appender(
_interval_shared_docs["overlaps"]
- % {
- "klass": "IntervalArray",
- "examples": textwrap.dedent(
+ % dict(
+ klass="IntervalArray",
+ examples=textwrap.dedent(
"""\
>>> data = [(0, 1), (1, 3), (2, 4)]
>>> intervals = pd.arrays.IntervalArray.from_tuples(data)
@@ -1158,7 +1154,7 @@ def mid(self):
Length: 3, closed: right, dtype: interval[int64]
"""
),
- }
+ )
)
def overlaps(self, other):
if isinstance(other, (IntervalArray, ABCIntervalIndex)):
@@ -1209,9 +1205,9 @@ def closed(self):
@Appender(
_interval_shared_docs["set_closed"]
- % {
- "klass": "IntervalArray",
- "examples": textwrap.dedent(
+ % dict(
+ klass="IntervalArray",
+ examples=textwrap.dedent(
"""\
Examples
--------
@@ -1226,7 +1222,7 @@ def closed(self):
Length: 3, closed: both, dtype: interval[int64]
"""
),
- }
+ )
)
def set_closed(self, closed):
if closed not in VALID_CLOSED:
@@ -1362,7 +1358,7 @@ def __arrow_array__(self, type=None):
"""
@Appender(
- _interval_shared_docs["to_tuples"] % {"return_type": "ndarray", "examples": ""}
+ _interval_shared_docs["to_tuples"] % dict(return_type="ndarray", examples="")
)
def to_tuples(self, na_tuple=True):
tuples = com.asarray_tuplesafe(zip(self._left, self._right))
@@ -1375,7 +1371,7 @@ def to_tuples(self, na_tuple=True):
@Appender(_extension_array_shared_docs["repeat"] % _shared_docs_kwargs)
def repeat(self, repeats, axis=None):
- nv.validate_repeat((), {"axis": axis})
+ nv.validate_repeat(tuple(), dict(axis=axis))
left_repeat = self.left.repeat(repeats)
right_repeat = self.right.repeat(repeats)
return self._shallow_copy(left=left_repeat, right=right_repeat)
@@ -1414,9 +1410,9 @@ def repeat(self, repeats, axis=None):
@Appender(
_interval_shared_docs["contains"]
- % {
- "klass": "IntervalArray",
- "examples": textwrap.dedent(
+ % dict(
+ klass="IntervalArray",
+ examples=textwrap.dedent(
"""\
>>> intervals = pd.arrays.IntervalArray.from_tuples([(0, 1), (1, 3), (2, 4)])
>>> intervals
@@ -1425,7 +1421,7 @@ def repeat(self, repeats, axis=None):
Length: 3, closed: right, dtype: interval[int64]
"""
),
- }
+ )
)
def contains(self, other):
if isinstance(other, Interval):
diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py
deleted file mode 100644
index 5447a84c86ac1..0000000000000
--- a/pandas/core/arrays/numeric.py
+++ /dev/null
@@ -1,92 +0,0 @@
-import datetime
-
-import numpy as np
-
-from pandas._libs import Timedelta, missing as libmissing
-from pandas.errors import AbstractMethodError
-
-from pandas.core.dtypes.common import (
- is_float,
- is_float_dtype,
- is_integer,
- is_integer_dtype,
- is_list_like,
-)
-
-from .masked import BaseMaskedArray
-
-
-class NumericArray(BaseMaskedArray):
- """
- Base class for IntegerArray and FloatingArray.
- """
-
- def _maybe_mask_result(self, result, mask, other, op_name: str):
- raise AbstractMethodError(self)
-
- def _arith_method(self, other, op):
- op_name = op.__name__
- omask = None
-
- if getattr(other, "ndim", 0) > 1:
- raise NotImplementedError("can only perform ops with 1-d structures")
-
- if isinstance(other, NumericArray):
- other, omask = other._data, other._mask
-
- elif is_list_like(other):
- other = np.asarray(other)
- if other.ndim > 1:
- raise NotImplementedError("can only perform ops with 1-d structures")
- if len(self) != len(other):
- raise ValueError("Lengths must match")
- if not (is_float_dtype(other) or is_integer_dtype(other)):
- raise TypeError("can only perform ops with numeric values")
-
- elif isinstance(other, (datetime.timedelta, np.timedelta64)):
- other = Timedelta(other)
-
- else:
- if not (is_float(other) or is_integer(other) or other is libmissing.NA):
- raise TypeError("can only perform ops with numeric values")
-
- if omask is None:
- mask = self._mask.copy()
- if other is libmissing.NA:
- mask |= True
- else:
- mask = self._mask | omask
-
- if op_name == "pow":
- # 1 ** x is 1.
- mask = np.where((self._data == 1) & ~self._mask, False, mask)
- # x ** 0 is 1.
- if omask is not None:
- mask = np.where((other == 0) & ~omask, False, mask)
- elif other is not libmissing.NA:
- mask = np.where(other == 0, False, mask)
-
- elif op_name == "rpow":
- # 1 ** x is 1.
- if omask is not None:
- mask = np.where((other == 1) & ~omask, False, mask)
- elif other is not libmissing.NA:
- mask = np.where(other == 1, False, mask)
- # x ** 0 is 1.
- mask = np.where((self._data == 0) & ~self._mask, False, mask)
-
- if other is libmissing.NA:
- result = np.ones_like(self._data)
- else:
- with np.errstate(all="ignore"):
- result = op(self._data, other)
-
- # divmod returns a tuple
- if op_name == "divmod":
- div, mod = result
- return (
- self._maybe_mask_result(div, mask, other, "floordiv"),
- self._maybe_mask_result(mod, mask, other, "mod"),
- )
-
- return self._maybe_mask_result(result, mask, other, op_name)
diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py
index 50d12703c3a30..4eb67dcd12728 100644
--- a/pandas/core/arrays/numpy_.py
+++ b/pandas/core/arrays/numpy_.py
@@ -273,12 +273,12 @@ def _values_for_factorize(self) -> Tuple[np.ndarray, int]:
# Reductions
def any(self, *, axis=None, out=None, keepdims=False, skipna=True):
- nv.validate_any((), {"out": out, "keepdims": keepdims})
+ nv.validate_any((), dict(out=out, keepdims=keepdims))
result = nanops.nanany(self._ndarray, axis=axis, skipna=skipna)
return self._wrap_reduction_result(axis, result)
def all(self, *, axis=None, out=None, keepdims=False, skipna=True):
- nv.validate_all((), {"out": out, "keepdims": keepdims})
+ nv.validate_all((), dict(out=out, keepdims=keepdims))
result = nanops.nanall(self._ndarray, axis=axis, skipna=skipna)
return self._wrap_reduction_result(axis, result)
@@ -311,7 +311,7 @@ def prod(self, *, axis=None, skipna=True, min_count=0, **kwargs) -> Scalar:
return self._wrap_reduction_result(axis, result)
def mean(self, *, axis=None, dtype=None, out=None, keepdims=False, skipna=True):
- nv.validate_mean((), {"dtype": dtype, "out": out, "keepdims": keepdims})
+ nv.validate_mean((), dict(dtype=dtype, out=out, keepdims=keepdims))
result = nanops.nanmean(self._ndarray, axis=axis, skipna=skipna)
return self._wrap_reduction_result(axis, result)
@@ -319,7 +319,7 @@ def median(
self, *, axis=None, out=None, overwrite_input=False, keepdims=False, skipna=True
):
nv.validate_median(
- (), {"out": out, "overwrite_input": overwrite_input, "keepdims": keepdims}
+ (), dict(out=out, overwrite_input=overwrite_input, keepdims=keepdims)
)
result = nanops.nanmedian(self._ndarray, axis=axis, skipna=skipna)
return self._wrap_reduction_result(axis, result)
@@ -328,7 +328,7 @@ def std(
self, *, axis=None, dtype=None, out=None, ddof=1, keepdims=False, skipna=True
):
nv.validate_stat_ddof_func(
- (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="std"
+ (), dict(dtype=dtype, out=out, keepdims=keepdims), fname="std"
)
result = nanops.nanstd(self._ndarray, axis=axis, skipna=skipna, ddof=ddof)
return self._wrap_reduction_result(axis, result)
@@ -337,7 +337,7 @@ def var(
self, *, axis=None, dtype=None, out=None, ddof=1, keepdims=False, skipna=True
):
nv.validate_stat_ddof_func(
- (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="var"
+ (), dict(dtype=dtype, out=out, keepdims=keepdims), fname="var"
)
result = nanops.nanvar(self._ndarray, axis=axis, skipna=skipna, ddof=ddof)
return self._wrap_reduction_result(axis, result)
@@ -346,21 +346,21 @@ def sem(
self, *, axis=None, dtype=None, out=None, ddof=1, keepdims=False, skipna=True
):
nv.validate_stat_ddof_func(
- (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="sem"
+ (), dict(dtype=dtype, out=out, keepdims=keepdims), fname="sem"
)
result = nanops.nansem(self._ndarray, axis=axis, skipna=skipna, ddof=ddof)
return self._wrap_reduction_result(axis, result)
def kurt(self, *, axis=None, dtype=None, out=None, keepdims=False, skipna=True):
nv.validate_stat_ddof_func(
- (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="kurt"
+ (), dict(dtype=dtype, out=out, keepdims=keepdims), fname="kurt"
)
result = nanops.nankurt(self._ndarray, axis=axis, skipna=skipna)
return self._wrap_reduction_result(axis, result)
def skew(self, *, axis=None, dtype=None, out=None, keepdims=False, skipna=True):
nv.validate_stat_ddof_func(
- (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="skew"
+ (), dict(dtype=dtype, out=out, keepdims=keepdims), fname="skew"
)
result = nanops.nanskew(self._ndarray, axis=axis, skipna=skipna)
return self._wrap_reduction_result(axis, result)
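
The nv.validate_* calls above follow pandas' numpy-compat pattern: the reductions accept numpy-only keywords such as out and keepdims purely so that np.sum(obj) and friends can dispatch to them, and the validator rejects any non-default value. A rough standalone sketch of that pattern (generic helper, not the pandas.compat.numpy module):

    def validate_numpy_kwargs(fname, kwargs, defaults):
        # Accept numpy-compat keywords in the signature, but only with default values
        for key, value in kwargs.items():
            if key not in defaults:
                raise TypeError(f"{fname}() got an unexpected keyword argument '{key}'")
            if value is not defaults[key]:
                raise ValueError(f"the '{key}' parameter is not supported in pandas' {fname}")

    # ok: caller passed only defaults
    validate_numpy_kwargs("mean", {"out": None, "keepdims": False},
                          {"out": None, "keepdims": False})
    # would raise ValueError: a real 'out' argument is not supported
    # validate_numpy_kwargs("mean", {"out": object()}, {"out": None, "keepdims": False})
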
diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py
index b8375af797b3a..c591f81390388 100644
--- a/pandas/core/arrays/sparse/array.py
+++ b/pandas/core/arrays/sparse/array.py
@@ -58,7 +58,7 @@
SparseArrayT = TypeVar("SparseArrayT", bound="SparseArray")
-_sparray_doc_kwargs = {"klass": "SparseArray"}
+_sparray_doc_kwargs = dict(klass="SparseArray")
def _get_fill(arr: "SparseArray") -> np.ndarray:
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index cc2013deb5252..e75305e55348c 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -18,8 +18,7 @@
from pandas.core import ops
from pandas.core.array_algos import masked_reductions
-from pandas.core.arrays import FloatingArray, IntegerArray, PandasArray
-from pandas.core.arrays.floating import FloatingDtype
+from pandas.core.arrays import IntegerArray, PandasArray
from pandas.core.arrays.integer import _IntegerDtype
from pandas.core.construction import extract_array
from pandas.core.indexers import check_array_indexer
@@ -295,19 +294,6 @@ def astype(self, dtype, copy=True):
arr[mask] = 0
values = arr.astype(dtype.numpy_dtype)
return IntegerArray(values, mask, copy=False)
- elif isinstance(dtype, FloatingDtype):
- arr = self.copy()
- mask = self.isna()
- arr[mask] = "0"
- values = arr.astype(dtype.numpy_dtype)
- return FloatingArray(values, mask, copy=False)
- elif np.issubdtype(dtype, np.floating):
- arr = self._ndarray.copy()
- mask = self.isna()
- arr[mask] = 0
- values = arr.astype(dtype)
- values[mask] = np.nan
- return values
return super().astype(dtype, copy)
diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py
index 0921c3460c626..998117cc49d50 100644
--- a/pandas/core/arrays/timedeltas.py
+++ b/pandas/core/arrays/timedeltas.py
@@ -373,7 +373,7 @@ def sum(
min_count: int = 0,
):
nv.validate_sum(
- (), {"dtype": dtype, "out": out, "keepdims": keepdims, "initial": initial}
+ (), dict(dtype=dtype, out=out, keepdims=keepdims, initial=initial)
)
result = nanops.nansum(
@@ -391,7 +391,7 @@ def std(
skipna: bool = True,
):
nv.validate_stat_ddof_func(
- (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="std"
+ (), dict(dtype=dtype, out=out, keepdims=keepdims), fname="std"
)
result = nanops.nanstd(self._ndarray, axis=axis, skipna=skipna, ddof=ddof)
diff --git a/pandas/core/base.py b/pandas/core/base.py
index f333ee0f71e46..5f724d9e89d05 100644
--- a/pandas/core/base.py
+++ b/pandas/core/base.py
@@ -46,13 +46,13 @@
if TYPE_CHECKING:
from pandas import Categorical
-_shared_docs: Dict[str, str] = {}
-_indexops_doc_kwargs = {
- "klass": "IndexOpsMixin",
- "inplace": "",
- "unique": "IndexOpsMixin",
- "duplicated": "IndexOpsMixin",
-}
+_shared_docs: Dict[str, str] = dict()
+_indexops_doc_kwargs = dict(
+ klass="IndexOpsMixin",
+ inplace="",
+ unique="IndexOpsMixin",
+ duplicated="IndexOpsMixin",
+)
_T = TypeVar("_T", bound="IndexOpsMixin")
diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py
index b819886687817..0498d4d171c00 100644
--- a/pandas/core/computation/pytables.py
+++ b/pandas/core/computation/pytables.py
@@ -35,7 +35,7 @@ def __init__(
queryables: Optional[Dict[str, Any]] = None,
):
super().__init__(level + 1, global_dict=global_dict, local_dict=local_dict)
- self.queryables = queryables or {}
+ self.queryables = queryables or dict()
class Term(ops.Term):
diff --git a/pandas/core/construction.py b/pandas/core/construction.py
index 96cf1be7520fb..f9ebe3f1e185e 100644
--- a/pandas/core/construction.py
+++ b/pandas/core/construction.py
@@ -402,24 +402,6 @@ def extract_array(obj: object, extract_numpy: bool = False) -> Union[Any, ArrayL
return obj
-def ensure_wrapped_if_datetimelike(arr):
- """
- Wrap datetime64 and timedelta64 ndarrays in DatetimeArray/TimedeltaArray.
- """
- if isinstance(arr, np.ndarray):
- if arr.dtype.kind == "M":
- from pandas.core.arrays import DatetimeArray
-
- return DatetimeArray._from_sequence(arr)
-
- elif arr.dtype.kind == "m":
- from pandas.core.arrays import TimedeltaArray
-
- return TimedeltaArray._from_sequence(arr)
-
- return arr
-
-
def sanitize_array(
data,
index: Optional[Index],
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index 12974d56dacdc..0f0e82f4ad4e2 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -99,6 +99,7 @@
from pandas import Series
from pandas.core.arrays import ExtensionArray
from pandas.core.indexes.base import Index
+ from pandas.core.indexes.datetimes import DatetimeIndex
_int8_max = np.iinfo(np.int8).max
_int16_max = np.iinfo(np.int16).max
@@ -296,9 +297,7 @@ def trans(x):
return result
-def maybe_cast_result(
- result: ArrayLike, obj: "Series", numeric_only: bool = False, how: str = ""
-) -> ArrayLike:
+def maybe_cast_result(result, obj: "Series", numeric_only: bool = False, how: str = ""):
"""
Try casting result to a different type if appropriate
@@ -318,23 +317,25 @@ def maybe_cast_result(
result : array-like
result maybe casted to the dtype.
"""
- dtype = obj.dtype
+ if obj.ndim > 1:
+ dtype = obj._values.dtype
+ else:
+ dtype = obj.dtype
dtype = maybe_cast_result_dtype(dtype, how)
- assert not is_scalar(result)
-
- if (
- is_extension_array_dtype(dtype)
- and not is_categorical_dtype(dtype)
- and dtype.kind != "M"
- ):
- # We have to special case categorical so as not to upcast
- # things like counts back to categorical
- cls = dtype.construct_array_type()
- result = maybe_cast_to_extension_array(cls, result, dtype=dtype)
+ if not is_scalar(result):
+ if (
+ is_extension_array_dtype(dtype)
+ and not is_categorical_dtype(dtype)
+ and dtype.kind != "M"
+ ):
+ # We have to special case categorical so as not to upcast
+ # things like counts back to categorical
+ cls = dtype.construct_array_type()
+ result = maybe_cast_to_extension_array(cls, result, dtype=dtype)
- elif numeric_only and is_numeric_dtype(dtype) or not numeric_only:
- result = maybe_downcast_to_dtype(result, dtype)
+ elif numeric_only and is_numeric_dtype(dtype) or not numeric_only:
+ result = maybe_downcast_to_dtype(result, dtype)
return result
@@ -451,9 +452,12 @@ def maybe_upcast_putmask(
# NaN -> NaT
# integer or integer array -> date-like array
if result.dtype.kind in ["m", "M"]:
- if isna(other):
- other = result.dtype.type("nat")
- elif is_integer(other):
+ if is_scalar(other):
+ if isna(other):
+ other = result.dtype.type("nat")
+ elif is_integer(other):
+ other = np.array(other, dtype=result.dtype)
+ elif is_integer_dtype(other):
other = np.array(other, dtype=result.dtype)
def changeit():
@@ -506,8 +510,9 @@ def maybe_casted_values(
"""
values = index._values
- if values.dtype == np.object_:
- values = lib.maybe_convert_objects(values)
+ if not isinstance(index, (ABCPeriodIndex, ABCDatetimeIndex)):
+ if values.dtype == np.object_:
+ values = lib.maybe_convert_objects(values)
# if we have the codes, extract the values with a mask
if codes is not None:
@@ -1123,37 +1128,101 @@ def astype_nansafe(
return arr.view(dtype)
-def soft_convert_objects(
- values: np.ndarray,
- datetime: bool = True,
- numeric: bool = True,
- timedelta: bool = True,
- copy: bool = True,
-):
+def maybe_convert_objects(
+ values: np.ndarray, convert_numeric: bool = True
+) -> Union[np.ndarray, "DatetimeIndex"]:
"""
- Try to coerce datetime, timedelta, and numeric object-dtype columns
- to inferred dtype.
+ If we have an object dtype array, try to coerce dates and/or numbers.
Parameters
----------
- values : np.ndarray[object]
- datetime : bool, default True
- numeric: bool, default True
- timedelta : bool, default True
- copy : bool, default True
+ values : ndarray
+ convert_numeric : bool, default True
Returns
-------
- np.ndarray
+ ndarray or DatetimeIndex
"""
+ validate_bool_kwarg(convert_numeric, "convert_numeric")
+
+ orig_values = values
+
+ # convert dates
+ if is_object_dtype(values.dtype):
+ values = lib.maybe_convert_objects(values, convert_datetime=True)
+
+ # convert timedeltas
+ if is_object_dtype(values.dtype):
+ values = lib.maybe_convert_objects(values, convert_timedelta=True)
+
+ # convert to numeric
+ if is_object_dtype(values.dtype):
+ if convert_numeric:
+ try:
+ new_values = lib.maybe_convert_numeric(
+ values, set(), coerce_numeric=True
+ )
+ except (ValueError, TypeError):
+ pass
+ else:
+ # only adopt the numeric conversion if it is not all-NaN
+ if not isna(new_values).all():
+ values = new_values
+
+ else:
+ # soft-conversion
+ values = lib.maybe_convert_objects(values)
+
+ if values is orig_values:
+ values = values.copy()
+
+ return values
+
+
+def soft_convert_objects(
+ values: np.ndarray,
+ datetime: bool = True,
+ numeric: bool = True,
+ timedelta: bool = True,
+ coerce: bool = False,
+ copy: bool = True,
+):
+ """ if we have an object dtype, try to coerce dates and/or numbers """
validate_bool_kwarg(datetime, "datetime")
validate_bool_kwarg(numeric, "numeric")
validate_bool_kwarg(timedelta, "timedelta")
+ validate_bool_kwarg(coerce, "coerce")
validate_bool_kwarg(copy, "copy")
conversion_count = sum((datetime, numeric, timedelta))
if conversion_count == 0:
raise ValueError("At least one of datetime, numeric or timedelta must be True.")
+ elif conversion_count > 1 and coerce:
+ raise ValueError(
+ "Only one of 'datetime', 'numeric' or "
+ "'timedelta' can be True when coerce=True."
+ )
+
+ if not is_object_dtype(values.dtype):
+ # If not object, do not attempt conversion
+ values = values.copy() if copy else values
+ return values
+
+ # coerce=True implies exactly one of datetime/numeric/timedelta is True (validated above)
+ if coerce:
+ # Immediate return if coerce
+ if datetime:
+ from pandas import to_datetime
+
+ return to_datetime(values, errors="coerce").to_numpy()
+ elif timedelta:
+ from pandas import to_timedelta
+
+ return to_timedelta(values, errors="coerce").to_numpy()
+ elif numeric:
+ from pandas import to_numeric
+
+ return to_numeric(values, errors="coerce")
# Soft conversions
if datetime:
@@ -1186,7 +1255,6 @@ def convert_dtypes(
convert_string: bool = True,
convert_integer: bool = True,
convert_boolean: bool = True,
- convert_floating: bool = True,
) -> Dtype:
"""
Convert objects to best possible type, and optionally,
@@ -1201,10 +1269,6 @@ def convert_dtypes(
Whether, if possible, conversion can be done to integer extension types.
convert_boolean : bool, defaults True
Whether object dtypes should be converted to ``BooleanDtypes()``.
- convert_floating : bool, defaults True
- Whether, if possible, conversion can be done to floating extension types.
- If `convert_integer` is also True, preference will be give to integer
- dtypes if the floats can be faithfully casted to integers.
Returns
-------
@@ -1212,9 +1276,7 @@ def convert_dtypes(
new dtype
"""
is_extension = is_extension_array_dtype(input_array.dtype)
- if (
- convert_string or convert_integer or convert_boolean or convert_floating
- ) and not is_extension:
+ if (convert_string or convert_integer or convert_boolean) and not is_extension:
try:
inferred_dtype = lib.infer_dtype(input_array)
except ValueError:
@@ -1242,29 +1304,6 @@ def convert_dtypes(
if is_integer_dtype(inferred_dtype):
inferred_dtype = input_array.dtype
- if convert_floating:
- if not is_integer_dtype(input_array.dtype) and is_numeric_dtype(
- input_array.dtype
- ):
- from pandas.core.arrays.floating import FLOAT_STR_TO_DTYPE
-
- inferred_float_dtype = FLOAT_STR_TO_DTYPE.get(
- input_array.dtype.name, "Float64"
- )
- # if we could also convert to integer, check if all floats
- # are actually integers
- if convert_integer:
- arr = input_array[notna(input_array)]
- if (arr.astype(int) == arr).all():
- inferred_dtype = "Int64"
- else:
- inferred_dtype = inferred_float_dtype
- else:
- inferred_dtype = inferred_float_dtype
- else:
- if is_float_dtype(inferred_dtype):
- inferred_dtype = input_array.dtype
-
if convert_boolean:
if is_bool_dtype(input_array.dtype):
inferred_dtype = "boolean"
@@ -1318,6 +1357,9 @@ def maybe_infer_to_datetimelike(
value, (ABCDatetimeIndex, ABCPeriodIndex, ABCDatetimeArray, ABCPeriodArray)
):
return value
+ elif isinstance(value, ABCSeries):
+ if isinstance(value._values, ABCDatetimeIndex):
+ return value._values
v = value
@@ -1409,6 +1451,9 @@ def maybe_cast_to_datetime(value, dtype: DtypeObj, errors: str = "raise"):
from pandas.core.tools.timedeltas import to_timedelta
if dtype is not None:
+ if isinstance(dtype, str):
+ dtype = np.dtype(dtype)
+
is_datetime64 = is_datetime64_dtype(dtype)
is_datetime64tz = is_datetime64tz_dtype(dtype)
is_timedelta64 = is_timedelta64_dtype(dtype)
@@ -1421,21 +1466,18 @@ def maybe_cast_to_datetime(value, dtype: DtypeObj, errors: str = "raise"):
f"Please pass in '{dtype.name}[ns]' instead."
)
- if is_datetime64:
- # unpack e.g. SparseDtype
- dtype = getattr(dtype, "subtype", dtype)
- if not is_dtype_equal(dtype, DT64NS_DTYPE):
-
- # pandas supports dtype whose granularity is less than [ns]
- # e.g., [ps], [fs], [as]
- if dtype <= np.dtype("M8[ns]"):
- if dtype.name == "datetime64":
- raise ValueError(msg)
- dtype = DT64NS_DTYPE
- else:
- raise TypeError(
- f"cannot convert datetimelike to dtype [{dtype}]"
- )
+ if is_datetime64 and not is_dtype_equal(
+ getattr(dtype, "subtype", dtype), DT64NS_DTYPE
+ ):
+
+ # pandas supports dtype whose granularity is less than [ns]
+ # e.g., [ps], [fs], [as]
+ if dtype <= np.dtype("M8[ns]"):
+ if dtype.name == "datetime64":
+ raise ValueError(msg)
+ dtype = DT64NS_DTYPE
+ else:
+ raise TypeError(f"cannot convert datetimelike to dtype [{dtype}]")
elif is_datetime64tz:
# our NaT doesn't support tz's
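
The coerce path restored in soft_convert_objects above delegates to the public converters with errors="coerce", so unparseable entries become NaT/NaN instead of raising. A quick usage example with the public API (illustrative values):

    import numpy as np
    import pandas as pd

    raw = np.array(["2020-01-01", "not a date", None], dtype=object)
    print(pd.to_datetime(raw, errors="coerce"))
    # DatetimeIndex(['2020-01-01', 'NaT', 'NaT'], dtype='datetime64[ns]', freq=None)
    print(pd.to_numeric(np.array(["1", "2.5", "x"], dtype=object), errors="coerce"))
    # float64 array of [1.0, 2.5, nan]
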
diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py
index a9355e30cd3c2..a9b0498081511 100644
--- a/pandas/core/dtypes/concat.py
+++ b/pandas/core/dtypes/concat.py
@@ -18,7 +18,7 @@
from pandas.core.arrays import ExtensionArray
from pandas.core.arrays.sparse import SparseArray
-from pandas.core.construction import array, ensure_wrapped_if_datetimelike
+from pandas.core.construction import array
def _get_dtype_kinds(arrays) -> Set[str]:
@@ -152,7 +152,7 @@ def is_nonempty(x) -> bool:
return np.concatenate(to_concat)
elif _contains_datetime or "timedelta" in typs:
- return _concat_datetime(to_concat, axis=axis)
+ return _concat_datetime(to_concat, axis=axis, typs=typs)
elif all_empty:
# we have all empties, but may need to coerce the result dtype to
@@ -346,7 +346,7 @@ def _concatenate_2d(to_concat, axis: int):
return np.concatenate(to_concat, axis=axis)
-def _concat_datetime(to_concat, axis=0):
+def _concat_datetime(to_concat, axis=0, typs=None):
"""
provide concatenation of a datetimelike array of arrays each of which is a
single M8[ns], datetime64[ns, tz] or m8[ns] dtype
@@ -355,19 +355,21 @@ def _concat_datetime(to_concat, axis=0):
----------
to_concat : array of arrays
axis : axis to provide concatenation
+ typs : set of to_concat dtypes
Returns
-------
a single array, preserving the combined dtypes
"""
- to_concat = [ensure_wrapped_if_datetimelike(x) for x in to_concat]
+ if typs is None:
+ typs = _get_dtype_kinds(to_concat)
+ to_concat = [_wrap_datetimelike(x) for x in to_concat]
single_dtype = len({x.dtype for x in to_concat}) == 1
# multiple types, need to coerce to object
if not single_dtype:
- # ensure_wrapped_if_datetimelike ensures that astype(object) wraps
- # in Timestamp/Timedelta
+ # _wrap_datetimelike ensures that astype(object) wraps in Timestamp/Timedelta
return _concatenate_2d([x.astype(object) for x in to_concat], axis=axis)
if axis == 1:
@@ -381,3 +383,17 @@ def _concat_datetime(to_concat, axis=0):
assert result.shape[0] == 1
result = result[0]
return result
+
+
+def _wrap_datetimelike(arr):
+ """
+ Wrap datetime64 and timedelta64 ndarrays in DatetimeArray/TimedeltaArray.
+
+ DTA/TDA handle .astype(object) correctly.
+ """
+ from pandas.core.construction import array as pd_array, extract_array
+
+ arr = extract_array(arr, extract_numpy=True)
+ if isinstance(arr, np.ndarray) and arr.dtype.kind in ["m", "M"]:
+ arr = pd_array(arr)
+ return arr
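
The helper above exists because a raw datetime64/timedelta64 ndarray converted with astype(object) yields numpy scalars, while the wrapped array yields Timestamp/Timedelta objects. A short demonstration with the public API:

    import numpy as np
    import pandas as pd

    stamps = np.array(["2020-01-01", "2020-01-02"], dtype="datetime64[ns]")
    print(type(stamps.astype(object)[0]))     # numpy.datetime64
    wrapped = pd.array(stamps)                # DatetimeArray
    print(type(wrapped.astype(object)[0]))    # pandas Timestamp
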
diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
index 136c8032094b1..07280702cf06f 100644
--- a/pandas/core/dtypes/dtypes.py
+++ b/pandas/core/dtypes/dtypes.py
@@ -399,14 +399,10 @@ def __eq__(self, other: Any) -> bool:
def __repr__(self) -> str_type:
if self.categories is None:
- data = "None"
+ data = "None, "
else:
data = self.categories._format_data(name=type(self).__name__)
- if data is None:
- # self.categories is RangeIndex
- data = str(self.categories._range)
- data = data.rstrip(", ")
- return f"CategoricalDtype(categories={data}, ordered={self.ordered})"
+ return f"CategoricalDtype(categories={data}ordered={self.ordered})"
@staticmethod
def _hash_categories(categories, ordered: Ordered = True) -> int:
diff --git a/pandas/core/dtypes/generic.py b/pandas/core/dtypes/generic.py
index dfbbaa9c1784a..0e5867809fe52 100644
--- a/pandas/core/dtypes/generic.py
+++ b/pandas/core/dtypes/generic.py
@@ -4,20 +4,7 @@
from typing import TYPE_CHECKING, Type, cast
if TYPE_CHECKING:
- from pandas import (
- CategoricalIndex,
- DataFrame,
- DatetimeIndex,
- Float64Index,
- Int64Index,
- IntervalIndex,
- MultiIndex,
- PeriodIndex,
- RangeIndex,
- Series,
- TimedeltaIndex,
- UInt64Index,
- )
+ from pandas import DataFrame, Series
from pandas.core.generic import NDFrame
@@ -31,50 +18,28 @@ def create_pandas_abc_type(name, attr, comp):
def _check(cls, inst) -> bool:
return getattr(inst, attr, "_typ") in comp
- dct = {"__instancecheck__": _check, "__subclasscheck__": _check}
+ dct = dict(__instancecheck__=_check, __subclasscheck__=_check)
meta = type("ABCBase", (type,), dct)
- return meta(name, (), dct)
+ return meta(name, tuple(), dct)
-ABCInt64Index = cast(
- "Type[Int64Index]",
- create_pandas_abc_type("ABCInt64Index", "_typ", ("int64index",)),
-)
-ABCUInt64Index = cast(
- "Type[UInt64Index]",
- create_pandas_abc_type("ABCUInt64Index", "_typ", ("uint64index",)),
-)
-ABCRangeIndex = cast(
- "Type[RangeIndex]",
- create_pandas_abc_type("ABCRangeIndex", "_typ", ("rangeindex",)),
-)
-ABCFloat64Index = cast(
- "Type[Float64Index]",
- create_pandas_abc_type("ABCFloat64Index", "_typ", ("float64index",)),
-)
-ABCMultiIndex = cast(
- "Type[MultiIndex]",
- create_pandas_abc_type("ABCMultiIndex", "_typ", ("multiindex",)),
-)
-ABCDatetimeIndex = cast(
- "Type[DatetimeIndex]",
- create_pandas_abc_type("ABCDatetimeIndex", "_typ", ("datetimeindex",)),
-)
-ABCTimedeltaIndex = cast(
- "Type[TimedeltaIndex]",
- create_pandas_abc_type("ABCTimedeltaIndex", "_typ", ("timedeltaindex",)),
-)
-ABCPeriodIndex = cast(
- "Type[PeriodIndex]",
- create_pandas_abc_type("ABCPeriodIndex", "_typ", ("periodindex",)),
-)
-ABCCategoricalIndex = cast(
- "Type[CategoricalIndex]",
- create_pandas_abc_type("ABCCategoricalIndex", "_typ", ("categoricalindex",)),
-)
-ABCIntervalIndex = cast(
- "Type[IntervalIndex]",
- create_pandas_abc_type("ABCIntervalIndex", "_typ", ("intervalindex",)),
+ABCInt64Index = create_pandas_abc_type("ABCInt64Index", "_typ", ("int64index",))
+ABCUInt64Index = create_pandas_abc_type("ABCUInt64Index", "_typ", ("uint64index",))
+ABCRangeIndex = create_pandas_abc_type("ABCRangeIndex", "_typ", ("rangeindex",))
+ABCFloat64Index = create_pandas_abc_type("ABCFloat64Index", "_typ", ("float64index",))
+ABCMultiIndex = create_pandas_abc_type("ABCMultiIndex", "_typ", ("multiindex",))
+ABCDatetimeIndex = create_pandas_abc_type(
+ "ABCDatetimeIndex", "_typ", ("datetimeindex",)
+)
+ABCTimedeltaIndex = create_pandas_abc_type(
+ "ABCTimedeltaIndex", "_typ", ("timedeltaindex",)
+)
+ABCPeriodIndex = create_pandas_abc_type("ABCPeriodIndex", "_typ", ("periodindex",))
+ABCCategoricalIndex = create_pandas_abc_type(
+ "ABCCategoricalIndex", "_typ", ("categoricalindex",)
+)
+ABCIntervalIndex = create_pandas_abc_type(
+ "ABCIntervalIndex", "_typ", ("intervalindex",)
)
ABCIndexClass = create_pandas_abc_type(
"ABCIndexClass",
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index f710660d6ad8e..c9030a0b2423a 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -172,14 +172,14 @@
# ---------------------------------------------------------------------
# Docstring templates
-_shared_doc_kwargs = {
- "axes": "index, columns",
- "klass": "DataFrame",
- "axes_single_arg": "{0 or 'index', 1 or 'columns'}",
- "axis": """axis : {0 or 'index', 1 or 'columns'}, default 0
+_shared_doc_kwargs = dict(
+ axes="index, columns",
+ klass="DataFrame",
+ axes_single_arg="{0 or 'index', 1 or 'columns'}",
+ axis="""axis : {0 or 'index', 1 or 'columns'}, default 0
If 0 or 'index': apply function to each column.
If 1 or 'columns': apply function to each row.""",
- "optional_by": """
+ optional_by="""
by : str or list of str
Name or list of names to sort by.
@@ -187,12 +187,12 @@
levels and/or column labels.
- if `axis` is 1 or `'columns'` then `by` may contain column
levels and/or index labels.""",
- "optional_labels": """labels : array-like, optional
+ optional_labels="""labels : array-like, optional
New labels / index to conform the axis specified by 'axis' to.""",
- "optional_axis": """axis : int or str, optional
+ optional_axis="""axis : int or str, optional
Axis to target. Can be either the axis name ('index', 'columns')
or number (0, 1).""",
-}
+)
_numeric_only_doc = """numeric_only : boolean, default None
Include only float, int, boolean data. If None, will attempt to use
@@ -524,7 +524,7 @@ def __init__(
return
mgr = self._init_mgr(
- data, axes={"index": index, "columns": columns}, dtype=dtype, copy=copy
+ data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy
)
elif isinstance(data, dict):
@@ -2902,7 +2902,7 @@ def transpose(self, *args, copy: bool = False) -> DataFrame:
1 object
dtype: object
"""
- nv.validate_transpose(args, {})
+ nv.validate_transpose(args, dict())
# construct the args
dtypes = list(self.dtypes)
@@ -5273,7 +5273,6 @@ def drop_duplicates(
return self.copy()
inplace = validate_bool_kwarg(inplace, "inplace")
- ignore_index = validate_bool_kwarg(ignore_index, "ignore_index")
duplicated = self.duplicated(subset, keep=keep)
result = self[-duplicated]
@@ -6387,7 +6386,7 @@ def combine(
otherSeries = otherSeries.astype(new_dtype)
arr = func(series, otherSeries)
- arr = maybe_downcast_to_dtype(arr, new_dtype)
+ arr = maybe_downcast_to_dtype(arr, this_dtype)
result[col] = arr
@@ -7359,7 +7358,7 @@ def unstack(self, level=-1, fill_value=None):
return result.__finalize__(self, method="unstack")
- @Appender(_shared_docs["melt"] % {"caller": "df.melt(", "other": "melt"})
+ @Appender(_shared_docs["melt"] % dict(caller="df.melt(", other="melt"))
def melt(
self,
id_vars=None,
@@ -9002,11 +9001,7 @@ def idxmin(self, axis=0, skipna=True) -> Series:
dtype: object
"""
axis = self._get_axis_number(axis)
-
- res = self._reduce(
- nanops.nanargmin, "argmin", axis=axis, skipna=skipna, numeric_only=False
- )
- indices = res._values
+ indices = nanops.nanargmin(self.values, axis=axis, skipna=skipna)
# indices will always be np.ndarray since axis is not None and
# values is a 2d array for DataFrame
@@ -9079,11 +9074,7 @@ def idxmax(self, axis=0, skipna=True) -> Series:
dtype: object
"""
axis = self._get_axis_number(axis)
-
- res = self._reduce(
- nanops.nanargmax, "argmax", axis=axis, skipna=skipna, numeric_only=False
- )
- indices = res._values
+ indices = nanops.nanargmax(self.values, axis=axis, skipna=skipna)
# indices will always be np.ndarray since axis is not None and
# values is a 2d array for DataFrame
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 808981debf1fe..e12053b71a815 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -128,15 +128,15 @@
# goal is to be able to define the docs close to function, while still being
# able to share
_shared_docs = {**_shared_docs}
-_shared_doc_kwargs = {
- "axes": "keywords for axes",
- "klass": "Series/DataFrame",
- "axes_single_arg": "int or labels for object",
- "args_transpose": "axes to permute (int or label for object)",
- "optional_by": """
+_shared_doc_kwargs = dict(
+ axes="keywords for axes",
+ klass="Series/DataFrame",
+ axes_single_arg="int or labels for object",
+ args_transpose="axes to permute (int or label for object)",
+ optional_by="""
by : str or list of str
Name or list of names to sort by""",
-}
+)
bool_t = bool # Need alias because NDFrame has def bool:
@@ -484,7 +484,7 @@ def _get_block_manager_axis(cls, axis: Axis) -> int:
def _get_axis_resolvers(self, axis: str) -> Dict[str, Union[Series, MultiIndex]]:
# index or columns
axis_index = getattr(self, axis)
- d = {}
+ d = dict()
prefix = axis[0]
for i, name in enumerate(axis_index.names):
@@ -1946,14 +1946,14 @@ def __array_ufunc__(
@final
def __getstate__(self) -> Dict[str, Any]:
meta = {k: getattr(self, k, None) for k in self._metadata}
- return {
- "_mgr": self._mgr,
- "_typ": self._typ,
- "_metadata": self._metadata,
- "attrs": self.attrs,
- "_flags": {k: self.flags[k] for k in self.flags._keys},
+ return dict(
+ _mgr=self._mgr,
+ _typ=self._typ,
+ _metadata=self._metadata,
+ attrs=self.attrs,
+ _flags={k: self.flags[k] for k in self.flags._keys},
**meta,
- }
+ )
@final
def __setstate__(self, state):
@@ -1967,7 +1967,7 @@ def __setstate__(self, state):
if typ is not None:
attrs = state.get("_attrs", {})
object.__setattr__(self, "_attrs", attrs)
- flags = state.get("_flags", {"allows_duplicate_labels": True})
+ flags = state.get("_flags", dict(allows_duplicate_labels=True))
object.__setattr__(self, "_flags", Flags(self, **flags))
# set in the order of internal names
@@ -2799,13 +2799,6 @@ def to_pickle(
default 'infer'
A string representing the compression to use in the output file. By
default, infers from the file extension in specified path.
- Compression mode may be any of the following possible
- values: {{‘infer’, ‘gzip’, ‘bz2’, ‘zip’, ‘xz’, None}}. If compression
- mode is ‘infer’ and path_or_buf is path-like, then detect
- compression mode from the following extensions:
- ‘.gz’, ‘.bz2’, ‘.zip’ or ‘.xz’. (otherwise no compression).
- If dict given and mode is ‘zip’ or inferred as ‘zip’, other entries
- passed as additional compression options.
protocol : int
Int which indicates which protocol should be used by the pickler,
default HIGHEST_PROTOCOL (see [1]_ paragraph 12.1.2). The possible
@@ -6004,6 +5997,7 @@ def _convert(
datetime: bool_t = False,
numeric: bool_t = False,
timedelta: bool_t = False,
+ coerce: bool_t = False,
) -> FrameOrSeries:
"""
Attempt to infer better dtype for object columns
@@ -6017,6 +6011,9 @@ def _convert(
unconvertible values becoming NaN.
timedelta : bool, default False
If True, convert to timedelta where possible.
+ coerce : bool, default False
+ If True, force conversion with unconvertible values converted to
+ nulls (NaN or NaT).
Returns
-------
@@ -6025,11 +6022,13 @@ def _convert(
validate_bool_kwarg(datetime, "datetime")
validate_bool_kwarg(numeric, "numeric")
validate_bool_kwarg(timedelta, "timedelta")
+ validate_bool_kwarg(coerce, "coerce")
return self._constructor(
self._mgr.convert(
datetime=datetime,
numeric=numeric,
timedelta=timedelta,
+ coerce=coerce,
copy=True,
)
).__finalize__(self)
@@ -6077,7 +6076,9 @@ def infer_objects(self: FrameOrSeries) -> FrameOrSeries:
# python objects will still be converted to
# native numpy numeric types
return self._constructor(
- self._mgr.convert(datetime=True, numeric=False, timedelta=True, copy=True)
+ self._mgr.convert(
+ datetime=True, numeric=False, timedelta=True, coerce=False, copy=True
+ )
).__finalize__(self, method="infer_objects")
@final
@@ -6087,7 +6088,6 @@ def convert_dtypes(
convert_string: bool_t = True,
convert_integer: bool_t = True,
convert_boolean: bool_t = True,
- convert_floating: bool_t = True,
) -> FrameOrSeries:
"""
Convert columns to best possible dtypes using dtypes supporting ``pd.NA``.
@@ -6104,12 +6104,6 @@ def convert_dtypes(
Whether, if possible, conversion can be done to integer extension types.
convert_boolean : bool, defaults True
Whether object dtypes should be converted to ``BooleanDtypes()``.
- convert_floating : bool, defaults True
- Whether, if possible, conversion can be done to floating extension types.
- If `convert_integer` is also True, preference will be give to integer
- dtypes if the floats can be faithfully casted to integers.
-
- .. versionadded:: 1.2.0
Returns
-------
@@ -6127,25 +6121,19 @@ def convert_dtypes(
-----
By default, ``convert_dtypes`` will attempt to convert a Series (or each
Series in a DataFrame) to dtypes that support ``pd.NA``. By using the options
- ``convert_string``, ``convert_integer``, ``convert_boolean`` and
- ``convert_boolean``, it is possible to turn off individual conversions
- to ``StringDtype``, the integer extension types, ``BooleanDtype``
- or floating extension types, respectively.
+ ``convert_string``, ``convert_integer``, and ``convert_boolean``, it is
+ possible to turn off individual conversions to ``StringDtype``, the integer
+ extension types or ``BooleanDtype``, respectively.
For object-dtyped columns, if ``infer_objects`` is ``True``, use the inference
rules as during normal Series/DataFrame construction. Then, if possible,
- convert to ``StringDtype``, ``BooleanDtype`` or an appropriate integer
- or floating extension type, otherwise leave as ``object``.
+ convert to ``StringDtype``, ``BooleanDtype`` or an appropriate integer extension
+ type, otherwise leave as ``object``.
If the dtype is integer, convert to an appropriate integer extension type.
If the dtype is numeric, and consists of all integers, convert to an
- appropriate integer extension type. Otherwise, convert to an
- appropriate floating extension type.
-
- .. versionchanged:: 1.2
- Starting with pandas 1.2, this method also converts float columns
- to the nullable floating extension type.
+ appropriate integer extension type.
In the future, as new dtypes are added that support ``pd.NA``, the results
of this method will change to support those new dtypes.
@@ -6185,7 +6173,7 @@ def convert_dtypes(
>>> dfn = df.convert_dtypes()
>>> dfn
a b c d e f
- 0 1 x True h 10
+ 0 1 x True h 10 NaN
1 2 y False i 100.5
2 3 z 20 200.0
@@ -6195,7 +6183,7 @@ def convert_dtypes(
c boolean
d string
e Int64
- f Float64
+ f float64
dtype: object
Start with a Series of strings and missing data represented by ``np.nan``.
@@ -6217,20 +6205,12 @@ def convert_dtypes(
"""
if self.ndim == 1:
return self._convert_dtypes(
- infer_objects,
- convert_string,
- convert_integer,
- convert_boolean,
- convert_floating,
+ infer_objects, convert_string, convert_integer, convert_boolean
)
else:
results = [
col._convert_dtypes(
- infer_objects,
- convert_string,
- convert_integer,
- convert_boolean,
- convert_floating,
+ infer_objects, convert_string, convert_integer, convert_boolean
)
for col_name, col in self.items()
]
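With ``convert_floating`` dropped in this hunk, float columns are left as NumPy ``float64`` unless they can be represented as nullable integers. A hedged sketch of the resulting behaviour (exact dtypes depend on the pandas version):

    import pandas as pd

    s = pd.Series([1, 2, None])
    s.dtype                                        # float64 (NaN forces float)
    s.convert_dtypes().dtype                       # Int64 (nullable integer)
    s.convert_dtypes(convert_integer=False).dtype  # stays float64 here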
@@ -7429,7 +7409,7 @@ def isna(self: FrameOrSeries) -> FrameOrSeries:
>>> df = pd.DataFrame(dict(age=[5, 6, np.NaN],
... born=[pd.NaT, pd.Timestamp('1939-05-27'),
- ... pd.Timestamp('1940-04-25')],
+ ... pd.Timestamp('1940-04-25')],
... name=['Alfred', 'Batman', ''],
... toy=[None, 'Batmobile', 'Joker']))
>>> df
@@ -7496,7 +7476,7 @@ def notna(self: FrameOrSeries) -> FrameOrSeries:
>>> df = pd.DataFrame(dict(age=[5, 6, np.NaN],
... born=[pd.NaT, pd.Timestamp('1939-05-27'),
- ... pd.Timestamp('1940-04-25')],
+ ... pd.Timestamp('1940-04-25')],
... name=['Alfred', 'Batman', ''],
... toy=[None, 'Batmobile', 'Joker']))
>>> df
@@ -8234,8 +8214,8 @@ def resample(
For DataFrame objects, the keyword `on` can be used to specify the
column instead of the index for resampling.
- >>> d = {'price': [10, 11, 9, 13, 14, 18, 17, 19],
- ... 'volume': [50, 60, 40, 100, 50, 100, 40, 50]}
+ >>> d = dict({'price': [10, 11, 9, 13, 14, 18, 17, 19],
+ ... 'volume': [50, 60, 40, 100, 50, 100, 40, 50]})
>>> df = pd.DataFrame(d)
>>> df['week_starting'] = pd.date_range('01/01/2018',
... periods=8,
@@ -8260,8 +8240,8 @@ def resample(
specify on which level the resampling needs to take place.
>>> days = pd.date_range('1/1/2000', periods=4, freq='D')
- >>> d2 = {'price': [10, 11, 9, 13, 14, 18, 17, 19],
- ... 'volume': [50, 60, 40, 100, 50, 100, 40, 50]}
+ >>> d2 = dict({'price': [10, 11, 9, 13, 14, 18, 17, 19],
+ ... 'volume': [50, 60, 40, 100, 50, 100, 40, 50]})
>>> df2 = pd.DataFrame(d2,
... index=pd.MultiIndex.from_product([days,
... ['morning',
@@ -10531,10 +10511,10 @@ def pct_change(
Percentage change in French franc, Deutsche Mark, and Italian lira from
1980-01-01 to 1980-03-01.
- >>> df = pd.DataFrame({
- ... 'FR': [4.0405, 4.0963, 4.3149],
- ... 'GR': [1.7246, 1.7482, 1.8519],
- ... 'IT': [804.74, 810.01, 860.13]},
+ >>> df = pd.DataFrame(dict(
+ ... FR=[4.0405, 4.0963, 4.3149],
+ ... GR=[1.7246, 1.7482, 1.8519],
+ ... IT=[804.74, 810.01, 860.13]),
... index=['1980-01-01', '1980-02-01', '1980-03-01'])
>>> df
FR GR IT
@@ -10551,10 +10531,10 @@ def pct_change(
Percentage of change in GOOG and APPL stock volume. Shows computing
the percentage change between columns.
- >>> df = pd.DataFrame({
- ... '2016': [1769950, 30586265],
- ... '2015': [1500923, 40912316],
- ... '2014': [1371819, 41403351]},
+ >>> df = pd.DataFrame(dict([
+ ... ('2016', [1769950, 30586265]),
+ ... ('2015', [1500923, 40912316]),
+ ... ('2014', [1371819, 41403351])]),
... index=['GOOG', 'APPL'])
>>> df
2016 2015 2014
diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py
index 99426c55da29b..7dc0db35bf8fe 100644
--- a/pandas/core/groupby/base.py
+++ b/pandas/core/groupby/base.py
@@ -6,8 +6,6 @@
import collections
from typing import List
-from pandas._typing import final
-
from pandas.core.dtypes.common import is_list_like, is_scalar
from pandas.core.base import PandasObject
@@ -18,7 +16,6 @@
class ShallowMixin(PandasObject):
_attributes: List[str] = []
- @final
def _shallow_copy(self, obj, **kwargs):
"""
return a new object with the replacement attributes
@@ -38,7 +35,6 @@ class GotItemMixin(PandasObject):
_attributes: List[str]
- @final
def _gotitem(self, key, ndim, subset=None):
"""
Sub-classes to define. Return a sliced object.
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 07ffb881495fa..244c47cd1f1ea 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -35,7 +35,9 @@
from pandas.core.dtypes.cast import (
find_common_type,
+ maybe_cast_result,
maybe_cast_result_dtype,
+ maybe_convert_objects,
maybe_downcast_numeric,
)
from pandas.core.dtypes.common import (
@@ -45,12 +47,12 @@
is_integer_dtype,
is_interval_dtype,
is_numeric_dtype,
+ is_object_dtype,
is_scalar,
needs_i8_conversion,
)
from pandas.core.dtypes.missing import isna, notna
-from pandas.core import algorithms, nanops
from pandas.core.aggregation import (
agg_list_like,
aggregate,
@@ -58,12 +60,13 @@
reconstruct_func,
validate_func_kwargs,
)
-from pandas.core.arrays import Categorical, ExtensionArray
+import pandas.core.algorithms as algorithms
+from pandas.core.arrays import ExtensionArray
from pandas.core.base import DataError, SpecificationError
import pandas.core.common as com
from pandas.core.construction import create_series_with_explicit_dtype
from pandas.core.frame import DataFrame
-from pandas.core.generic import NDFrame
+from pandas.core.generic import ABCDataFrame, ABCSeries, NDFrame
from pandas.core.groupby import base
from pandas.core.groupby.groupby import (
GroupBy,
@@ -531,7 +534,7 @@ def _transform_general(self, func, *args, **kwargs):
object.__setattr__(group, "name", name)
res = func(group, *args, **kwargs)
- if isinstance(res, (DataFrame, Series)):
+ if isinstance(res, (ABCDataFrame, ABCSeries)):
res = res._values
results.append(klass(res, index=group.index))
@@ -1026,69 +1029,43 @@ def _cython_agg_blocks(
if numeric_only:
data = data.get_numeric_data(copy=False)
+ no_result = object()
+
def cast_agg_result(result, values: ArrayLike, how: str) -> ArrayLike:
# see if we can cast the values to the desired dtype
# this may not be the original dtype
assert not isinstance(result, DataFrame)
+ assert result is not no_result
dtype = maybe_cast_result_dtype(values.dtype, how)
result = maybe_downcast_numeric(result, dtype)
- if isinstance(values, Categorical) and isinstance(result, np.ndarray):
- # If the Categorical op didn't raise, it is dtype-preserving
- result = type(values)._from_sequence(result.ravel(), dtype=values.dtype)
- # Note this will have result.dtype == dtype from above
+ if isinstance(values, ExtensionArray) and isinstance(result, np.ndarray):
+ # e.g. values was an IntegerArray
+ # (1, N) case can occur if values was Categorical
+ # and result is ndarray[object]
+ # TODO(EA2D): special casing not needed with 2D EAs
+ assert result.ndim == 1 or result.shape[0] == 1
+ try:
+ # Cast back if feasible
+ result = type(values)._from_sequence(
+ result.ravel(), dtype=values.dtype
+ )
+ except (ValueError, TypeError):
+ # reshape to be valid for non-Extension Block
+ result = result.reshape(1, -1)
elif isinstance(result, np.ndarray) and result.ndim == 1:
# We went through a SeriesGroupByPath and need to reshape
- # GH#32223 includes case with IntegerArray values
result = result.reshape(1, -1)
- # test_groupby_duplicate_columns gets here with
- # result.dtype == int64, values.dtype=object, how="min"
-
- return result
-
- def py_fallback(bvalues: ArrayLike) -> ArrayLike:
- # if self.grouper.aggregate fails, we fall back to a pure-python
- # solution
-
- # We get here with a) EADtypes and b) object dtype
- obj: FrameOrSeriesUnion
-
- # call our grouper again with only this block
- if isinstance(bvalues, ExtensionArray):
- # TODO(EA2D): special case not needed with 2D EAs
- obj = Series(bvalues)
- else:
- obj = DataFrame(bvalues.T)
- if obj.shape[1] == 1:
- # Avoid call to self.values that can occur in DataFrame
- # reductions; see GH#28949
- obj = obj.iloc[:, 0]
-
- # Create SeriesGroupBy with observed=True so that it does
- # not try to add missing categories if grouping over multiple
- # Categoricals. This will done by later self._reindex_output()
- # Doing it here creates an error. See GH#34951
- sgb = get_groupby(obj, self.grouper, observed=True)
- result = sgb.aggregate(lambda x: alt(x, axis=self.axis))
-
- assert isinstance(result, (Series, DataFrame)) # for mypy
- # In the case of object dtype block, it may have been split
- # in the operation. We un-split here.
- result = result._consolidate()
- assert isinstance(result, (Series, DataFrame)) # for mypy
- assert len(result._mgr.blocks) == 1
- # unwrap DataFrame to get array
- result = result._mgr.blocks[0].values
return result
def blk_func(bvalues: ArrayLike) -> ArrayLike:
try:
- result = self.grouper._cython_operation(
- "aggregate", bvalues, how, axis=1, min_count=min_count
+ result, _ = self.grouper.aggregate(
+ bvalues, how, axis=1, min_count=min_count
)
except NotImplementedError:
# generally if we have numeric_only=False
@@ -1101,7 +1078,35 @@ def blk_func(bvalues: ArrayLike) -> ArrayLike:
assert how == "ohlc"
raise
- result = py_fallback(bvalues)
+ # We get here with a) EADtypes and b) object dtype
+ obj: FrameOrSeriesUnion
+ # call our grouper again with only this block
+ if isinstance(bvalues, ExtensionArray):
+ # TODO(EA2D): special case not needed with 2D EAs
+ obj = Series(bvalues)
+ else:
+ obj = DataFrame(bvalues.T)
+ if obj.shape[1] == 1:
+ # Avoid call to self.values that can occur in DataFrame
+ # reductions; see GH#28949
+ obj = obj.iloc[:, 0]
+
+ # Create SeriesGroupBy with observed=True so that it does
+ # not try to add missing categories if grouping over multiple
+            # Categoricals. This will be done later by self._reindex_output()
+ # Doing it here creates an error. See GH#34951
+ sgb = get_groupby(obj, self.grouper, observed=True)
+ result = sgb.aggregate(lambda x: alt(x, axis=self.axis))
+
+ assert isinstance(result, (Series, DataFrame)) # for mypy
+ # In the case of object dtype block, it may have been split
+ # in the operation. We un-split here.
+ result = result._consolidate()
+ assert isinstance(result, (Series, DataFrame)) # for mypy
+ assert len(result._mgr.blocks) == 1
+
+ # unwrap DataFrame to get array
+ result = result._mgr.blocks[0].values
return cast_agg_result(result, bvalues, how)
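The fallback branch above is what extension dtypes hit when no cython aggregation exists for them. From the public API the same path can be exercised with a nullable column; a minimal illustration (the result dtype may vary by version):

    import pandas as pd

    df = pd.DataFrame(
        {"key": ["a", "a", "b"], "val": pd.array([1, 2, 3], dtype="Int64")}
    )
    df.groupby("key")["val"].sum()
    # key
    # a    3
    # b    3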
@@ -1145,6 +1150,7 @@ def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame:
data = obj[item]
colg = SeriesGroupBy(data, selection=item, grouper=self.grouper)
+ cast = self._transform_should_cast(func)
try:
result[item] = colg.aggregate(func, *args, **kwargs)
@@ -1157,6 +1163,10 @@ def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame:
cannot_agg.append(item)
continue
+ else:
+ if cast:
+ result[item] = maybe_cast_result(result[item], data)
+
result_columns = obj.columns
if cannot_agg:
result_columns = result_columns.drop(cannot_agg)
@@ -1274,7 +1284,7 @@ def _wrap_applied_output_series(
# as we are stacking can easily have object dtypes here
so = self._selected_obj
if so.ndim == 2 and so.dtypes.apply(needs_i8_conversion).any():
- result = result._convert(datetime=True)
+ result = _recast_datetimelike_result(result)
else:
result = result._convert(datetime=True)
@@ -1826,46 +1836,40 @@ def nunique(self, dropna: bool = True) -> DataFrame:
self._insert_inaxis_grouper_inplace(results)
return results
- @Appender(DataFrame.idxmax.__doc__)
- def idxmax(self, axis=0, skipna: bool = True):
- axis = DataFrame._get_axis_number(axis)
- numeric_only = None if axis == 0 else False
-
- def func(df):
- # NB: here we use numeric_only=None, in DataFrame it is False GH#38217
- res = df._reduce(
- nanops.nanargmax,
- "argmax",
- axis=axis,
- skipna=skipna,
- numeric_only=numeric_only,
- )
- indices = res._values
- index = df._get_axis(axis)
- result = [index[i] if i >= 0 else np.nan for i in indices]
- return df._constructor_sliced(result, index=res.index)
-
- return self._python_apply_general(func, self._obj_with_exclusions)
-
- @Appender(DataFrame.idxmin.__doc__)
- def idxmin(self, axis=0, skipna: bool = True):
- axis = DataFrame._get_axis_number(axis)
- numeric_only = None if axis == 0 else False
-
- def func(df):
- # NB: here we use numeric_only=None, in DataFrame it is False GH#38217
- res = df._reduce(
- nanops.nanargmin,
- "argmin",
- axis=axis,
- skipna=skipna,
- numeric_only=numeric_only,
- )
- indices = res._values
- index = df._get_axis(axis)
- result = [index[i] if i >= 0 else np.nan for i in indices]
- return df._constructor_sliced(result, index=res.index)
+ boxplot = boxplot_frame_groupby
- return self._python_apply_general(func, self._obj_with_exclusions)
- boxplot = boxplot_frame_groupby
+def _recast_datetimelike_result(result: DataFrame) -> DataFrame:
+ """
+    If we have datetime-like values in the original, then coerce dates,
+    as stacking can easily leave object dtypes here.
+
+ Parameters
+ ----------
+ result : DataFrame
+
+ Returns
+ -------
+ DataFrame
+
+ Notes
+ -----
+ - Assumes Groupby._selected_obj has ndim==2 and at least one
+ datetimelike column
+ """
+ result = result.copy()
+
+ obj_cols = [
+ idx
+ for idx in range(len(result.columns))
+ if is_object_dtype(result.dtypes.iloc[idx])
+ ]
+
+ # See GH#26285
+ for n in obj_cols:
+ converted = maybe_convert_objects(
+ result.iloc[:, n].values, convert_numeric=False
+ )
+
+ result.iloc[:, n] = converted
+ return result
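``_recast_datetimelike_result`` relies on the private ``maybe_convert_objects``; a rough public-API analogue of the same idea (soft-converting leftover object columns) would look like this — a sketch, not the helper itself:

    import pandas as pd

    def recast_object_columns(df: pd.DataFrame) -> pd.DataFrame:
        # Soft-convert object columns (e.g. Timestamps left as object after
        # stacking) back to their natural dtypes; other columns are untouched.
        out = df.copy()
        for col in out.columns:
            if out[col].dtype == object:
                out[col] = out[col].infer_objects()
        return out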
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 23f0e178130be..ae3612c99d5cd 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -11,7 +11,7 @@ class providing the base-class of operations.
import datetime
from functools import partial, wraps
import inspect
-from textwrap import dedent
+import re
import types
from typing import (
Callable,
@@ -45,13 +45,12 @@ class providing the base-class of operations.
IndexLabel,
Label,
Scalar,
- final,
)
from pandas.compat.numpy import function as nv
from pandas.errors import AbstractMethodError
from pandas.util._decorators import Appender, Substitution, cache_readonly, doc
-from pandas.core.dtypes.cast import maybe_downcast_to_dtype
+from pandas.core.dtypes.cast import maybe_cast_result
from pandas.core.dtypes.common import (
ensure_float,
is_bool_dtype,
@@ -86,8 +85,8 @@ class providing the base-class of operations.
to each row or column of a DataFrame.
"""
-_apply_docs = {
- "template": """
+_apply_docs = dict(
+ template="""
Apply function `func` group-wise and combine the results together.
The function passed to `apply` must take a {input} as its first
@@ -124,7 +123,7 @@ class providing the base-class of operations.
Series.apply : Apply a function to a Series.
DataFrame.apply : Apply a function to each row or column of a DataFrame.
""",
- "dataframe_examples": """
+ dataframe_examples="""
>>> df = pd.DataFrame({'A': 'a a b'.split(),
'B': [1,2,3],
'C': [4,6, 5]})
@@ -164,7 +163,7 @@ class providing the base-class of operations.
b 2
dtype: int64
""",
- "series_examples": """
+ series_examples="""
>>> s = pd.Series([0, 1, 2], index='a a b'.split())
>>> g = s.groupby(s.index)
@@ -203,7 +202,7 @@ class providing the base-class of operations.
--------
{examples}
""",
-}
+)
_groupby_agg_method_template = """
Compute {fname} of group values.
@@ -449,7 +448,6 @@ class providing the base-class of operations.
"""
-@final
class GroupByPlot(PandasObject):
"""
Class implementing the .plot attribute for groupby objects.
@@ -573,11 +571,9 @@ def __init__(
self.grouper = grouper
self.exclusions = exclusions or set()
- @final
def __len__(self) -> int:
return len(self.groups)
- @final
def __repr__(self) -> str:
# TODO: Better repr for GroupBy object
return object.__repr__(self)
@@ -589,7 +585,6 @@ def _assure_grouper(self) -> None:
"""
pass
- @final
@property
def groups(self) -> Dict[Hashable, np.ndarray]:
"""
@@ -598,13 +593,11 @@ def groups(self) -> Dict[Hashable, np.ndarray]:
self._assure_grouper()
return self.grouper.groups
- @final
@property
def ngroups(self) -> int:
self._assure_grouper()
return self.grouper.ngroups
- @final
@property
def indices(self):
"""
@@ -613,7 +606,6 @@ def indices(self):
self._assure_grouper()
return self.grouper.indices
- @final
def _get_indices(self, names):
"""
Safe get multiple indices, translate keys for
@@ -664,14 +656,12 @@ def get_converter(s):
return [self.indices.get(name, []) for name in names]
- @final
def _get_index(self, name):
"""
Safe get index, translate keys for datelike to underlying repr.
"""
return self._get_indices([name])[0]
- @final
@cache_readonly
def _selected_obj(self):
# Note: _selected_obj is always just `self.obj` for SeriesGroupBy
@@ -683,7 +673,6 @@ def _selected_obj(self):
else:
return self.obj[self._selection]
- @final
def _reset_group_selection(self) -> None:
"""
Clear group based selection.
@@ -696,7 +685,6 @@ def _reset_group_selection(self) -> None:
self._group_selection = None
self._reset_cache("_selected_obj")
- @final
def _set_group_selection(self) -> None:
"""
Create group based selection.
@@ -722,7 +710,6 @@ def _set_group_selection(self) -> None:
self._group_selection = ax.difference(Index(groupers), sort=False).tolist()
self._reset_cache("_selected_obj")
- @final
def _set_result_index_ordered(
self, result: "OutputFrameOrSeries"
) -> "OutputFrameOrSeries":
@@ -739,7 +726,6 @@ def _set_result_index_ordered(
result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True)
return result
- @final
def _dir_additions(self) -> Set[str]:
return self.obj._dir_additions() | self._apply_allowlist
@@ -755,25 +741,23 @@ def __getattr__(self, attr: str):
@Substitution(
klass="GroupBy",
- examples=dedent(
- """\
- >>> df = pd.DataFrame({'A': 'a b a b'.split(), 'B': [1, 2, 3, 4]})
- >>> df
- A B
- 0 a 1
- 1 b 2
- 2 a 3
- 3 b 4
-
- To get the difference between each groups maximum and minimum value in one
- pass, you can do
-
- >>> df.groupby('A').pipe(lambda x: x.max() - x.min())
- B
- A
- a 2
- b 2"""
- ),
+ examples="""\
+>>> df = pd.DataFrame({'A': 'a b a b'.split(), 'B': [1, 2, 3, 4]})
+>>> df
+ A B
+0 a 1
+1 b 2
+2 a 3
+3 b 4
+
+To get the difference between each group's maximum and minimum value in one
+pass, you can do
+
+>>> df.groupby('A').pipe(lambda x: x.max() - x.min())
+ B
+A
+a 2
+b 2""",
)
@Appender(_pipe_template)
def pipe(self, func, *args, **kwargs):
@@ -781,7 +765,6 @@ def pipe(self, func, *args, **kwargs):
plot = property(GroupByPlot)
- @final
def _make_wrapper(self, name: str) -> Callable:
assert name in self._apply_allowlist
@@ -814,12 +797,27 @@ def curried(x):
if name in base.plotting_methods:
return self.apply(curried)
- return self._python_apply_general(curried, self._obj_with_exclusions)
+ try:
+ return self._python_apply_general(curried, self._obj_with_exclusions)
+ except TypeError as err:
+ if not re.search(
+ "reduction operation '.*' not allowed for this dtype", str(err)
+ ):
+ # We don't have a cython implementation
+ # TODO: is the above comment accurate?
+ raise
+
+ if self.obj.ndim == 1:
+ # this can be called recursively, so need to raise ValueError
+ raise ValueError
+
+ # GH#3688 try to operate item-by-item
+ result = self._aggregate_item_by_item(name, *args, **kwargs)
+ return result
wrapper.__name__ = name
return wrapper
- @final
def get_group(self, name, obj=None):
"""
Construct DataFrame from group with provided name.
@@ -906,7 +904,6 @@ def f(g):
return result
- @final
def _python_apply_general(
self, f: F, data: FrameOrSeriesUnion
) -> FrameOrSeriesUnion:
@@ -937,7 +934,6 @@ def _iterate_slices(self) -> Iterable[Series]:
def transform(self, func, *args, **kwargs):
raise AbstractMethodError(self)
- @final
def _cumcount_array(self, ascending: bool = True):
"""
Parameters
@@ -970,12 +966,24 @@ def _cumcount_array(self, ascending: bool = True):
rev[sorter] = np.arange(count, dtype=np.intp)
return out[rev].astype(np.int64, copy=False)
- @final
- def _cython_transform(
- self, how: str, numeric_only: bool = True, axis: int = 0, **kwargs
- ):
- output: Dict[base.OutputKey, np.ndarray] = {}
+ def _transform_should_cast(self, func_nm: str) -> bool:
+ """
+ Parameters
+ ----------
+ func_nm: str
+ The name of the aggregation function being performed
+ Returns
+ -------
+ bool
+ Whether transform should attempt to cast the result of aggregation
+ """
+ filled_series = self.grouper.size().fillna(0)
+ assert filled_series is not None
+ return filled_series.gt(0).any() and func_nm not in base.cython_cast_blocklist
+
+ def _cython_transform(self, how: str, numeric_only: bool = True, **kwargs):
+ output: Dict[base.OutputKey, np.ndarray] = {}
for idx, obj in enumerate(self._iterate_slices()):
name = obj.name
is_numeric = is_numeric_dtype(obj.dtype)
@@ -983,12 +991,13 @@ def _cython_transform(
continue
try:
- result = self.grouper._cython_operation(
- "transform", obj._values, how, axis, **kwargs
- )
+ result, _ = self.grouper.transform(obj.values, how, **kwargs)
except NotImplementedError:
continue
+ if self._transform_should_cast(how):
+ result = maybe_cast_result(result, obj, how=how)
+
key = base.OutputKey(label=name, position=idx)
output[key] = result
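``_transform_should_cast`` above only decides whether a cython transform result is cast back toward the input dtype; from the user's side this is what keeps, say, an integer column integer through a group-wise cumulative sum. A small example:

    import pandas as pd

    df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1, 2, 3]})
    df.groupby("key")["val"].cumsum()
    # 0    1
    # 1    3
    # 2    3
    # Name: val, dtype: int64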
@@ -1008,7 +1017,6 @@ def _wrap_transformed_output(self, output: Mapping[base.OutputKey, np.ndarray]):
def _wrap_applied_output(self, keys, values, not_indexed_same: bool = False):
raise AbstractMethodError(self)
- @final
def _agg_general(
self,
numeric_only: bool = True,
@@ -1059,22 +1067,21 @@ def _cython_agg_general(
if numeric_only and not is_numeric:
continue
- result = self.grouper._cython_operation(
- "aggregate", obj._values, how, axis=0, min_count=min_count
+ result, agg_names = self.grouper.aggregate(
+ obj._values, how, min_count=min_count
)
- if how == "ohlc":
+ if agg_names:
# e.g. ohlc
- agg_names = ["open", "high", "low", "close"]
assert len(agg_names) == result.shape[1]
for result_column, result_name in zip(result.T, agg_names):
key = base.OutputKey(label=result_name, position=idx)
- output[key] = result_column
+ output[key] = maybe_cast_result(result_column, obj, how=how)
idx += 1
else:
assert result.ndim == 1
key = base.OutputKey(label=name, position=idx)
- output[key] = result
+ output[key] = maybe_cast_result(result, obj, how=how)
idx += 1
if not output:
@@ -1082,7 +1089,6 @@ def _cython_agg_general(
return self._wrap_aggregated_output(output, index=self.grouper.result_index)
- @final
def _transform_with_numba(self, data, func, *args, engine_kwargs=None, **kwargs):
"""
Perform groupby transform routine with the numba engine.
@@ -1117,7 +1123,6 @@ def _transform_with_numba(self, data, func, *args, engine_kwargs=None, **kwargs)
# evaluated the data sorted by group
return result.take(np.argsort(sorted_index), axis=0)
- @final
def _aggregate_with_numba(self, data, func, *args, engine_kwargs=None, **kwargs):
"""
Perform groupby aggregation routine with the numba engine.
@@ -1154,7 +1159,6 @@ def _aggregate_with_numba(self, data, func, *args, engine_kwargs=None, **kwargs)
index = Index(group_keys, name=self.grouper.names[0])
return result, index
- @final
def _python_agg_general(self, func, *args, **kwargs):
func = self._is_builtin_func(func)
f = lambda x: func(x, *args, **kwargs)
@@ -1176,28 +1180,25 @@ def _python_agg_general(self, func, *args, **kwargs):
assert result is not None
key = base.OutputKey(label=name, position=idx)
+ output[key] = maybe_cast_result(result, obj, numeric_only=True)
- if is_numeric_dtype(obj.dtype):
- result = maybe_downcast_to_dtype(result, obj.dtype)
+ if not output:
+ return self._python_apply_general(f, self._selected_obj)
- if self.grouper._filter_empty_groups:
- mask = counts.ravel() > 0
+ if self.grouper._filter_empty_groups:
+
+ mask = counts.ravel() > 0
+ for key, result in output.items():
# since we are masking, make sure that we have a float object
values = result
if is_numeric_dtype(values.dtype):
values = ensure_float(values)
- result = maybe_downcast_to_dtype(values[mask], result.dtype)
-
- output[key] = result
-
- if not output:
- return self._python_apply_general(f, self._selected_obj)
+ output[key] = maybe_cast_result(values[mask], result)
return self._wrap_aggregated_output(output, index=self.grouper.result_index)
- @final
def _concat_objects(self, keys, values, not_indexed_same: bool = False):
from pandas.core.reshape.concat import concat
@@ -1211,7 +1212,7 @@ def reset_identity(values):
if not not_indexed_same:
result = concat(values, axis=self.axis)
- ax = self.filter(lambda x: True).axes[self.axis]
+ ax = self._selected_obj._get_axis(self.axis)
# this is a very unfortunate situation
# we can't use reindex to restore the original order
@@ -1259,7 +1260,6 @@ def reset_identity(values):
return result
- @final
def _apply_filter(self, indices, dropna):
if len(indices) == 0:
indices = np.array([], dtype="int64")
@@ -1349,7 +1349,6 @@ class GroupBy(BaseGroupBy[FrameOrSeries]):
more
"""
- @final
@property
def _obj_1d_constructor(self) -> Type["Series"]:
# GH28330 preserve subclassed Series/DataFrames
@@ -1358,7 +1357,6 @@ def _obj_1d_constructor(self) -> Type["Series"]:
assert isinstance(self.obj, Series)
return self.obj._constructor
- @final
def _bool_agg(self, val_test, skipna):
"""
Shared func to call any / all Cython GroupBy implementations.
@@ -1388,7 +1386,6 @@ def result_to_bool(result: np.ndarray, inference: Type) -> np.ndarray:
skipna=skipna,
)
- @final
@Substitution(name="groupby")
@Appender(_common_see_also)
def any(self, skipna: bool = True):
@@ -1408,7 +1405,6 @@ def any(self, skipna: bool = True):
"""
return self._bool_agg("any", skipna)
- @final
@Substitution(name="groupby")
@Appender(_common_see_also)
def all(self, skipna: bool = True):
@@ -1442,7 +1438,6 @@ def count(self):
# defined here for API doc
raise NotImplementedError
- @final
@Substitution(name="groupby")
@Substitution(see_also=_common_see_also)
def mean(self, numeric_only: bool = True):
@@ -1499,7 +1494,6 @@ def mean(self, numeric_only: bool = True):
numeric_only=numeric_only,
)
- @final
@Substitution(name="groupby")
@Appender(_common_see_also)
def median(self, numeric_only=True):
@@ -1525,7 +1519,6 @@ def median(self, numeric_only=True):
numeric_only=numeric_only,
)
- @final
@Substitution(name="groupby")
@Appender(_common_see_also)
def std(self, ddof: int = 1):
@@ -1555,7 +1548,6 @@ def std(self, ddof: int = 1):
ddof=ddof,
)
- @final
@Substitution(name="groupby")
@Appender(_common_see_also)
def var(self, ddof: int = 1):
@@ -1583,7 +1575,6 @@ def var(self, ddof: int = 1):
with group_selection_context(self):
return self._python_agg_general(func)
- @final
@Substitution(name="groupby")
@Appender(_common_see_also)
def sem(self, ddof: int = 1):
@@ -1614,7 +1605,6 @@ def sem(self, ddof: int = 1):
)
return result
- @final
@Substitution(name="groupby")
@Appender(_common_see_also)
def size(self) -> FrameOrSeriesUnion:
@@ -1640,7 +1630,6 @@ def size(self) -> FrameOrSeriesUnion:
return self._reindex_output(result, fill_value=0)
- @final
@doc(_groupby_agg_method_template, fname="sum", no=True, mc=0)
def sum(self, numeric_only: bool = True, min_count: int = 0):
@@ -1657,28 +1646,24 @@ def sum(self, numeric_only: bool = True, min_count: int = 0):
return self._reindex_output(result, fill_value=0)
- @final
@doc(_groupby_agg_method_template, fname="prod", no=True, mc=0)
def prod(self, numeric_only: bool = True, min_count: int = 0):
return self._agg_general(
numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod
)
- @final
@doc(_groupby_agg_method_template, fname="min", no=False, mc=-1)
def min(self, numeric_only: bool = False, min_count: int = -1):
return self._agg_general(
numeric_only=numeric_only, min_count=min_count, alias="min", npfunc=np.min
)
- @final
@doc(_groupby_agg_method_template, fname="max", no=False, mc=-1)
def max(self, numeric_only: bool = False, min_count: int = -1):
return self._agg_general(
numeric_only=numeric_only, min_count=min_count, alias="max", npfunc=np.max
)
- @final
@doc(_groupby_agg_method_template, fname="first", no=False, mc=-1)
def first(self, numeric_only: bool = False, min_count: int = -1):
def first_compat(obj: FrameOrSeries, axis: int = 0):
@@ -1703,7 +1688,6 @@ def first(x: Series):
npfunc=first_compat,
)
- @final
@doc(_groupby_agg_method_template, fname="last", no=False, mc=-1)
def last(self, numeric_only: bool = False, min_count: int = -1):
def last_compat(obj: FrameOrSeries, axis: int = 0):
@@ -1728,7 +1712,6 @@ def last(x: Series):
npfunc=last_compat,
)
- @final
@Substitution(name="groupby")
@Appender(_common_see_also)
def ohlc(self) -> DataFrame:
@@ -1744,7 +1727,6 @@ def ohlc(self) -> DataFrame:
"""
return self._apply_to_column_groupbys(lambda x: x._cython_agg_general("ohlc"))
- @final
@doc(DataFrame.describe)
def describe(self, **kwargs):
with group_selection_context(self):
@@ -1753,7 +1735,6 @@ def describe(self, **kwargs):
return result.T
return result.unstack()
- @final
def resample(self, rule, *args, **kwargs):
"""
Provide resampling when using a TimeGrouper.
@@ -1855,7 +1836,6 @@ def resample(self, rule, *args, **kwargs):
return get_resampler_for_grouping(self, rule, *args, **kwargs)
- @final
@Substitution(name="groupby")
@Appender(_common_see_also)
def rolling(self, *args, **kwargs):
@@ -1866,7 +1846,6 @@ def rolling(self, *args, **kwargs):
return RollingGroupby(self, *args, **kwargs)
- @final
@Substitution(name="groupby")
@Appender(_common_see_also)
def expanding(self, *args, **kwargs):
@@ -1878,7 +1857,6 @@ def expanding(self, *args, **kwargs):
return ExpandingGroupby(self, *args, **kwargs)
- @final
@Substitution(name="groupby")
@Appender(_common_see_also)
def ewm(self, *args, **kwargs):
@@ -1889,7 +1867,6 @@ def ewm(self, *args, **kwargs):
return ExponentialMovingWindowGroupby(self, *args, **kwargs)
- @final
def _fill(self, direction, limit=None):
"""
Shared function for `pad` and `backfill` to call Cython method.
@@ -1928,7 +1905,6 @@ def _fill(self, direction, limit=None):
dropna=self.dropna,
)
- @final
@Substitution(name="groupby")
def pad(self, limit=None):
"""
@@ -1955,7 +1931,6 @@ def pad(self, limit=None):
ffill = pad
- @final
@Substitution(name="groupby")
def backfill(self, limit=None):
"""
@@ -1982,7 +1957,6 @@ def backfill(self, limit=None):
bfill = backfill
- @final
@Substitution(name="groupby")
@Substitution(see_also=_common_see_also)
def nth(self, n: Union[int, List[int]], dropna: Optional[str] = None) -> DataFrame:
@@ -2156,7 +2130,6 @@ def nth(self, n: Union[int, List[int]], dropna: Optional[str] = None) -> DataFra
return result
- @final
def quantile(self, q=0.5, interpolation: str = "linear"):
"""
Return group values at the given quantile, a la numpy.percentile.
@@ -2254,38 +2227,30 @@ def post_processor(vals: np.ndarray, inference: Optional[Type]) -> np.ndarray:
)
for qi in q
]
- result = concat(results, axis=self.axis, keys=q)
+ result = concat(results, axis=0, keys=q)
# fix levels to place quantiles on the inside
# TODO(GH-10710): Ideally, we could write this as
# >>> result.stack(0).loc[pd.IndexSlice[:, ..., q], :]
# but this hits https://github.com/pandas-dev/pandas/issues/10710
# which doesn't reorder the list-like `q` on the inner level.
- order = list(range(1, result.axes[self.axis].nlevels)) + [0]
+ order = list(range(1, result.index.nlevels)) + [0]
# temporarily saves the index names
- index_names = np.array(result.axes[self.axis].names)
+ index_names = np.array(result.index.names)
# set index names to positions to avoid confusion
- result.axes[self.axis].names = np.arange(len(index_names))
+ result.index.names = np.arange(len(index_names))
# place quantiles on the inside
- if isinstance(result, Series):
- result = result.reorder_levels(order)
- else:
- result = result.reorder_levels(order, axis=self.axis)
+ result = result.reorder_levels(order)
# restore the index names in order
- result.axes[self.axis].names = index_names[order]
+ result.index.names = index_names[order]
# reorder rows to keep things sorted
- indices = (
- np.arange(result.shape[self.axis])
- .reshape([len(q), self.ngroups])
- .T.flatten()
- )
- return result.take(indices, axis=self.axis)
+ indices = np.arange(len(result)).reshape([len(q), self.ngroups]).T.flatten()
+ return result.take(indices)
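The reordering above is what places the quantile level on the inside of the result index when a list of quantiles is passed. For reference, the shape this produces:

    import pandas as pd

    df = pd.DataFrame({"key": ["a", "a", "b", "b"], "val": [1, 2, 3, 4]})
    df.groupby("key")["val"].quantile([0.25, 0.75])
    # key
    # a    0.25    1.25
    #      0.75    1.75
    # b    0.25    3.25
    #      0.75    3.75
    # Name: val, dtype: float64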
- @final
@Substitution(name="groupby")
def ngroup(self, ascending: bool = True):
"""
@@ -2353,7 +2318,6 @@ def ngroup(self, ascending: bool = True):
result = self.ngroups - 1 - result
return result
- @final
@Substitution(name="groupby")
def cumcount(self, ascending: bool = True):
"""
@@ -2413,7 +2377,6 @@ def cumcount(self, ascending: bool = True):
cumcounts = self._cumcount_array(ascending=ascending)
return self._obj_1d_constructor(cumcounts, index)
- @final
@Substitution(name="groupby")
@Appender(_common_see_also)
def rank(
@@ -2463,7 +2426,6 @@ def rank(
axis=axis,
)
- @final
@Substitution(name="groupby")
@Appender(_common_see_also)
def cumprod(self, axis=0, *args, **kwargs):
@@ -2480,7 +2442,6 @@ def cumprod(self, axis=0, *args, **kwargs):
return self._cython_transform("cumprod", **kwargs)
- @final
@Substitution(name="groupby")
@Appender(_common_see_also)
def cumsum(self, axis=0, *args, **kwargs):
@@ -2497,7 +2458,6 @@ def cumsum(self, axis=0, *args, **kwargs):
return self._cython_transform("cumsum", **kwargs)
- @final
@Substitution(name="groupby")
@Appender(_common_see_also)
def cummin(self, axis=0, **kwargs):
@@ -2513,7 +2473,6 @@ def cummin(self, axis=0, **kwargs):
return self._cython_transform("cummin", numeric_only=False)
- @final
@Substitution(name="groupby")
@Appender(_common_see_also)
def cummax(self, axis=0, **kwargs):
@@ -2529,7 +2488,6 @@ def cummax(self, axis=0, **kwargs):
return self._cython_transform("cummax", numeric_only=False)
- @final
def _get_cythonized_result(
self,
how: str,
@@ -2688,7 +2646,6 @@ def _get_cythonized_result(
else:
return self._wrap_transformed_output(output)
- @final
@Substitution(name="groupby")
def shift(self, periods=1, freq=None, axis=0, fill_value=None):
"""
@@ -2732,7 +2689,6 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None):
periods=periods,
)
- @final
@Substitution(name="groupby")
@Appender(_common_see_also)
def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None, axis=0):
@@ -2762,7 +2718,6 @@ def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None, axis=0
shifted = fill_grp.shift(periods=periods, freq=freq, axis=self.axis)
return (filled / shifted) - 1
- @final
@Substitution(name="groupby")
@Substitution(see_also=_common_see_also)
def head(self, n=5):
@@ -2800,7 +2755,6 @@ def head(self, n=5):
else:
return self._selected_obj.iloc[:, mask]
- @final
@Substitution(name="groupby")
@Substitution(see_also=_common_see_also)
def tail(self, n=5):
@@ -2838,7 +2792,6 @@ def tail(self, n=5):
else:
return self._selected_obj.iloc[:, mask]
- @final
def _reindex_output(
self, output: OutputFrameOrSeries, fill_value: Scalar = np.NaN
) -> OutputFrameOrSeries:
@@ -2925,7 +2878,6 @@ def _reindex_output(
return output.reset_index(drop=True)
- @final
def sample(
self,
n: Optional[int] = None,
diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
index 496aa6f327096..261190747ee61 100644
--- a/pandas/core/groupby/grouper.py
+++ b/pandas/core/groupby/grouper.py
@@ -7,7 +7,7 @@
import numpy as np
-from pandas._typing import FrameOrSeries, Label, final
+from pandas._typing import FrameOrSeries, Label
from pandas.errors import InvalidIndexError
from pandas.util._decorators import cache_readonly
@@ -18,6 +18,7 @@
is_scalar,
is_timedelta64_dtype,
)
+from pandas.core.dtypes.generic import ABCSeries
import pandas.core.algorithms as algorithms
from pandas.core.arrays import Categorical, ExtensionArray
@@ -314,7 +315,6 @@ def __init__(
self._grouper = None
self.dropna = dropna
- @final
@property
def ax(self):
return self.grouper
@@ -346,7 +346,6 @@ def _get_grouper(self, obj, validate: bool = True):
)
return self.binner, self.grouper, self.obj
- @final
def _set_grouper(self, obj: FrameOrSeries, sort: bool = False):
"""
given an object and the specifications, setup the internal grouper
@@ -371,7 +370,9 @@ def _set_grouper(self, obj: FrameOrSeries, sort: bool = False):
if self.key is not None:
key = self.key
# The 'on' is already defined
- if getattr(self.grouper, "name", None) == key and isinstance(obj, Series):
+ if getattr(self.grouper, "name", None) == key and isinstance(
+ obj, ABCSeries
+ ):
# pandas\core\groupby\grouper.py:348: error: Item "None" of
# "Optional[Any]" has no attribute "take" [union-attr]
ax = self._grouper.take(obj.index) # type: ignore[union-attr]
@@ -406,14 +407,12 @@ def _set_grouper(self, obj: FrameOrSeries, sort: bool = False):
self.grouper = ax
return self.grouper
- @final
@property
def groups(self):
# pandas\core\groupby\grouper.py:382: error: Item "None" of
# "Optional[Any]" has no attribute "groups" [union-attr]
return self.grouper.groups # type: ignore[union-attr]
- @final
def __repr__(self) -> str:
attrs_list = (
f"{attr_name}={repr(getattr(self, attr_name))}"
@@ -425,7 +424,6 @@ def __repr__(self) -> str:
return f"{cls_name}({attrs})"
-@final
class Grouping:
"""
Holds the grouping information for a single key
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index c60a59916affc..50c4cc53a12bb 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -24,15 +24,11 @@
from pandas._libs import NaT, iNaT, lib
import pandas._libs.groupby as libgroupby
import pandas._libs.reduction as libreduction
-from pandas._typing import ArrayLike, F, FrameOrSeries, Label, Shape, final
+from pandas._typing import F, FrameOrSeries, Label, Shape
from pandas.errors import AbstractMethodError
from pandas.util._decorators import cache_readonly
-from pandas.core.dtypes.cast import (
- maybe_cast_result,
- maybe_cast_result_dtype,
- maybe_downcast_to_dtype,
-)
+from pandas.core.dtypes.cast import maybe_cast_result
from pandas.core.dtypes.common import (
ensure_float,
ensure_float64,
@@ -146,7 +142,6 @@ def get_iterator(
for key, (i, group) in zip(keys, splitter):
yield key, group.__finalize__(data, method="groupby")
- @final
def _get_splitter(self, data: FrameOrSeries, axis: int = 0) -> "DataSplitter":
"""
Returns
@@ -167,7 +162,6 @@ def _get_grouper(self):
"""
return self.groupings[0].grouper
- @final
def _get_group_keys(self):
if len(self.groupings) == 1:
return self.levels[0]
@@ -177,7 +171,6 @@ def _get_group_keys(self):
# provide "flattened" iterator for multi-group setting
return get_flattened_list(comp_ids, ngroups, self.levels, self.codes)
- @final
def apply(self, f: F, data: FrameOrSeries, axis: int = 0):
mutated = self.mutated
splitter = self._get_splitter(data, axis=axis)
@@ -259,7 +252,6 @@ def levels(self) -> List[Index]:
def names(self) -> List[Label]:
return [ping.name for ping in self.groupings]
- @final
def size(self) -> Series:
"""
Compute group sizes.
@@ -282,7 +274,6 @@ def groups(self) -> Dict[Hashable, np.ndarray]:
to_groupby = Index(to_groupby)
return self.axis.groupby(to_groupby)
- @final
@cache_readonly
def is_monotonic(self) -> bool:
# return if my group orderings are monotonic
@@ -296,7 +287,6 @@ def group_info(self):
comp_ids = ensure_int64(comp_ids)
return comp_ids, obs_group_ids, ngroups
- @final
@cache_readonly
def codes_info(self) -> np.ndarray:
# return the codes of items in original grouped axis
@@ -306,7 +296,6 @@ def codes_info(self) -> np.ndarray:
codes = codes[sorter]
return codes
- @final
def _get_compressed_codes(self) -> Tuple[np.ndarray, np.ndarray]:
all_codes = self.codes
if len(all_codes) > 1:
@@ -316,7 +305,6 @@ def _get_compressed_codes(self) -> Tuple[np.ndarray, np.ndarray]:
ping = self.groupings[0]
return ping.codes, np.arange(len(ping.group_index))
- @final
@cache_readonly
def ngroups(self) -> int:
return len(self.result_index)
@@ -338,7 +326,6 @@ def result_index(self) -> Index:
levels=levels, codes=codes, verify_integrity=False, names=self.names
)
- @final
def get_group_levels(self) -> List[Index]:
if not self.compressed and len(self.groupings) == 1:
return [self.groupings[0].result_index]
@@ -379,7 +366,8 @@ def get_group_levels(self) -> List[Index]:
_cython_arity = {"ohlc": 4} # OHLC
- @final
+ _name_functions = {"ohlc": ["open", "high", "low", "close"]}
+
def _is_builtin_func(self, arg):
"""
if we define a builtin function for this argument, return it,
@@ -387,7 +375,6 @@ def _is_builtin_func(self, arg):
"""
return SelectionMixin._builtin_table.get(arg, arg)
- @final
def _get_cython_function(
self, kind: str, how: str, values: np.ndarray, is_numeric: bool
):
@@ -424,7 +411,6 @@ def _get_cython_function(
return func
- @final
def _get_cython_func_and_vals(
self, kind: str, how: str, values: np.ndarray, is_numeric: bool
):
@@ -459,82 +445,17 @@ def _get_cython_func_and_vals(
raise
return func, values
- @final
- def _disallow_invalid_ops(self, values: ArrayLike, how: str):
- """
- Check if we can do this operation with our cython functions.
-
- Raises
- ------
- NotImplementedError
- This is either not a valid function for this dtype, or
- valid but not implemented in cython.
- """
- dtype = values.dtype
-
- if is_categorical_dtype(dtype) or is_sparse(dtype):
- # categoricals are only 1d, so we
- # are not setup for dim transforming
- raise NotImplementedError(f"{dtype} dtype not supported")
- elif is_datetime64_any_dtype(dtype):
- # we raise NotImplemented if this is an invalid operation
- # entirely, e.g. adding datetimes
- if how in ["add", "prod", "cumsum", "cumprod"]:
- raise NotImplementedError(
- f"datetime64 type does not support {how} operations"
- )
- elif is_timedelta64_dtype(dtype):
- if how in ["prod", "cumprod"]:
- raise NotImplementedError(
- f"timedelta64 type does not support {how} operations"
- )
-
- @final
- def _ea_wrap_cython_operation(
+ def _cython_operation(
self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs
) -> Tuple[np.ndarray, Optional[List[str]]]:
"""
- If we have an ExtensionArray, unwrap, call _cython_operation, and
- re-wrap if appropriate.
- """
- # TODO: general case implementation overrideable by EAs.
- orig_values = values
-
- if is_datetime64tz_dtype(values.dtype) or is_period_dtype(values.dtype):
- # All of the functions implemented here are ordinal, so we can
- # operate on the tz-naive equivalents
- values = values.view("M8[ns]")
- res_values = self._cython_operation(
- kind, values, how, axis, min_count, **kwargs
- )
- if how in ["rank"]:
- # preserve float64 dtype
- return res_values
-
- res_values = res_values.astype("i8", copy=False)
- result = type(orig_values)._simple_new(res_values, dtype=orig_values.dtype)
- return result
-
- elif is_integer_dtype(values.dtype) or is_bool_dtype(values.dtype):
- # IntegerArray or BooleanArray
- values = ensure_int_or_float(values)
- res_values = self._cython_operation(
- kind, values, how, axis, min_count, **kwargs
- )
- result = maybe_cast_result(result=res_values, obj=orig_values, how=how)
- return result
-
- raise NotImplementedError(values.dtype)
+ Returns the values of a cython operation as a Tuple of [data, names].
- @final
- def _cython_operation(
- self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs
- ) -> np.ndarray:
+ Names is only useful when dealing with 2D results, like ohlc
+ (see self._name_functions).
"""
- Returns the values of a cython operation.
- """
- orig_values = values
assert kind in ["transform", "aggregate"]
+ orig_values = values
if values.ndim > 2:
raise NotImplementedError("number of dimensions is currently limited to 2")
@@ -545,12 +466,30 @@ def _cython_operation(
# can we do this operation with our cython functions
# if not raise NotImplementedError
- self._disallow_invalid_ops(values, how)
- if is_extension_array_dtype(values.dtype):
- return self._ea_wrap_cython_operation(
- kind, values, how, axis, min_count, **kwargs
- )
+ # we raise NotImplemented if this is an invalid operation
+ # entirely, e.g. adding datetimes
+
+ # categoricals are only 1d, so we
+ # are not setup for dim transforming
+ if is_categorical_dtype(values.dtype) or is_sparse(values.dtype):
+ raise NotImplementedError(f"{values.dtype} dtype not supported")
+ elif is_datetime64_any_dtype(values.dtype):
+ if how in ["add", "prod", "cumsum", "cumprod"]:
+ raise NotImplementedError(
+ f"datetime64 type does not support {how} operations"
+ )
+ elif is_timedelta64_dtype(values.dtype):
+ if how in ["prod", "cumprod"]:
+ raise NotImplementedError(
+ f"timedelta64 type does not support {how} operations"
+ )
+
+ if is_datetime64tz_dtype(values.dtype):
+ # Cast to naive; we'll cast back at the end of the function
+ # TODO: possible need to reshape?
+ # TODO(EA2D):kludge can be avoided when 2D EA is allowed.
+ values = values.view("M8[ns]")
is_datetimelike = needs_i8_conversion(values.dtype)
is_numeric = is_numeric_dtype(values.dtype)
@@ -629,18 +568,36 @@ def _cython_operation(
if vdim == 1 and arity == 1:
result = result[:, 0]
+ names: Optional[List[str]] = self._name_functions.get(how, None)
+
if swapped:
result = result.swapaxes(0, axis)
- if how not in base.cython_cast_blocklist:
- # e.g. if we are int64 and need to restore to datetime64/timedelta64
- # "rank" is the only member of cython_cast_blocklist we get here
- dtype = maybe_cast_result_dtype(orig_values.dtype, how)
- result = maybe_downcast_to_dtype(result, dtype)
+ if is_datetime64tz_dtype(orig_values.dtype) or is_period_dtype(
+ orig_values.dtype
+ ):
+ # We need to use the constructors directly for these dtypes
+ # since numpy won't recognize them
+ # https://github.com/pandas-dev/pandas/issues/31471
+ result = type(orig_values)(result.astype(np.int64), dtype=orig_values.dtype)
+ elif is_datetimelike and kind == "aggregate":
+ result = result.astype(orig_values.dtype)
+
+ if is_extension_array_dtype(orig_values.dtype):
+ result = maybe_cast_result(result=result, obj=orig_values, how=how)
- return result
+ return result, names
+
+ def aggregate(
+ self, values, how: str, axis: int = 0, min_count: int = -1
+ ) -> Tuple[np.ndarray, Optional[List[str]]]:
+ return self._cython_operation(
+ "aggregate", values, how, axis, min_count=min_count
+ )
+
+ def transform(self, values, how: str, axis: int = 0, **kwargs):
+ return self._cython_operation("transform", values, how, axis, **kwargs)
- @final
def _aggregate(
self, result, counts, values, comp_ids, agg_func, min_count: int = -1
):
@@ -652,7 +609,6 @@ def _aggregate(
return result
- @final
def _transform(
self, result, values, comp_ids, transform_func, is_datetimelike: bool, **kwargs
):
@@ -691,7 +647,6 @@ def agg_series(self, obj: Series, func: F):
raise
return self._aggregate_series_pure_python(obj, func)
- @final
def _aggregate_series_fast(self, obj: Series, func: F):
# At this point we have already checked that
# - obj.index is not a MultiIndex
@@ -711,7 +666,6 @@ def _aggregate_series_fast(self, obj: Series, func: F):
result, counts = grouper.get_result()
return result, counts
- @final
def _aggregate_series_pure_python(self, obj: Series, func: F):
group_index, _, ngroups = self.group_info
@@ -737,7 +691,7 @@ def _aggregate_series_pure_python(self, obj: Series, func: F):
result[label] = res
result = lib.maybe_convert_objects(result, try_float=0)
- result = maybe_cast_result(result, obj, numeric_only=True)
+ # TODO: maybe_cast_to_extension_array?
return result, counts
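The ``_name_functions`` mapping reintroduced above is how the four ``ohlc`` column names travel back with the cython result. From the public API the effect is simply:

    import pandas as pd

    df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1, 3, 2]})
    df.groupby("key")["val"].ohlc()
    #      open  high  low  close
    # key
    # a       1     3    1      3
    # b       2     2    2      2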
diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py
index da4654bbf2c10..b6713bc760c5e 100644
--- a/pandas/core/indexers.py
+++ b/pandas/core/indexers.py
@@ -79,9 +79,6 @@ def is_scalar_indexer(indexer, ndim: int) -> bool:
-------
bool
"""
- if ndim == 1 and is_integer(indexer):
- # GH37748: allow indexer to be an integer for Series
- return True
if isinstance(indexer, tuple):
if len(indexer) == ndim:
return all(
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index ba958b23e81af..c49f3f9457161 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -33,7 +33,6 @@
from pandas.util._decorators import Appender, cache_readonly, doc
from pandas.core.dtypes.cast import (
- find_common_type,
maybe_cast_to_integer_array,
validate_numeric_casting,
)
@@ -70,6 +69,7 @@
ABCMultiIndex,
ABCPandasArray,
ABCPeriodIndex,
+ ABCRangeIndex,
ABCSeries,
ABCTimedeltaIndex,
)
@@ -106,15 +106,15 @@
_unsortable_types = frozenset(("mixed", "mixed-integer"))
-_index_doc_kwargs = {
- "klass": "Index",
- "inplace": "",
- "target_klass": "Index",
- "raises_section": "",
- "unique": "Index",
- "duplicated": "np.ndarray",
-}
-_index_shared_docs = {}
+_index_doc_kwargs = dict(
+ klass="Index",
+ inplace="",
+ target_klass="Index",
+ raises_section="",
+ unique="Index",
+ duplicated="np.ndarray",
+)
+_index_shared_docs = dict()
str_t = str
@@ -817,7 +817,7 @@ def _maybe_disallow_fill(self, allow_fill: bool, fill_value, indices) -> bool:
@Appender(_index_shared_docs["repeat"] % _index_doc_kwargs)
def repeat(self, repeats, axis=None):
repeats = ensure_platform_int(repeats)
- nv.validate_repeat(tuple(), {"axis": axis})
+ nv.validate_repeat(tuple(), dict(axis=axis))
return self._shallow_copy(self._values.repeat(repeats))
# --------------------------------------------------------------------
@@ -2155,7 +2155,7 @@ def is_all_dates(self):
# Pickle Methods
def __reduce__(self):
- d = {"data": self._data}
+ d = dict(data=self._data)
d.update(self._get_attributes_dict())
return _new_Index, (type(self), d), None
@@ -2379,10 +2379,6 @@ def unique(self, level=None):
"""
if level is not None:
self._validate_index_level(level)
-
- if self.is_unique:
- return self._shallow_copy()
-
result = super().unique()
return self._shallow_copy(result)
@@ -2694,7 +2690,7 @@ def union(self, other, sort=None):
"""
self._validate_sort_keyword(sort)
self._assert_can_do_setop(other)
- other, result_name = self._convert_can_do_setop(other)
+ other = ensure_index(other)
if not self._can_union_without_object_cast(other):
return self._union_incompatible_dtypes(other, sort=sort)
@@ -2824,15 +2820,14 @@ def intersection(self, other, sort=False):
"""
self._validate_sort_keyword(sort)
self._assert_can_do_setop(other)
- other, _ = self._convert_can_do_setop(other)
+ other = ensure_index(other)
- if self.equals(other) and not self.has_duplicates:
+ if self.equals(other):
return self._get_reconciled_name_object(other)
if not is_dtype_equal(self.dtype, other.dtype):
- dtype = find_common_type([self.dtype, other.dtype])
- this = self.astype(dtype, copy=False)
- other = other.astype(dtype, copy=False)
+ this = self.astype("O")
+ other = other.astype("O")
return this.intersection(other, sort=sort)
result = self._intersection(other, sort=sort)
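With mismatched dtypes now falling back to an object cast, ``Index.intersection`` still returns the common labels; a quick sketch (the repr of the result class varies by version):

    import pandas as pd

    pd.Index([1, 2, 3]).intersection(pd.Index([2, 3, 4]))
    # Int64Index([2, 3], dtype='int64')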
@@ -2852,7 +2847,7 @@ def _intersection(self, other, sort=False):
except TypeError:
pass
else:
- return algos.unique1d(result)
+ return result
try:
indexer = Index(rvals).get_indexer(lvals)
@@ -2863,14 +2858,11 @@ def _intersection(self, other, sort=False):
indexer = algos.unique1d(Index(rvals).get_indexer_non_unique(lvals)[0])
indexer = indexer[indexer != -1]
- result = other.take(indexer).unique()._values
+ result = other.take(indexer)._values
if sort is None:
result = algos.safe_sort(result)
- # Intersection has to be unique
- assert Index(result).is_unique
-
return result
def difference(self, other, sort=None):
@@ -2913,15 +2905,12 @@ def difference(self, other, sort=None):
"""
self._validate_sort_keyword(sort)
self._assert_can_do_setop(other)
- other, result_name = self._convert_can_do_setop(other)
if self.equals(other):
- return self[:0].rename(result_name)
+ # pass an empty np.ndarray with the appropriate dtype
+ return self._shallow_copy(self._data[:0])
- result = self._difference(other, sort=sort)
- return self._wrap_setop_result(other, result)
-
- def _difference(self, other, sort):
+ other, result_name = self._convert_can_do_setop(other)
this = self._get_unique_index()
@@ -2936,7 +2925,7 @@ def _difference(self, other, sort):
except TypeError:
pass
- return the_diff
+ return this._shallow_copy(the_diff, name=result_name)
def symmetric_difference(self, other, result_name=None, sort=None):
"""
@@ -3496,7 +3485,12 @@ def reindex(self, target, method=None, level=None, limit=None, tolerance=None):
target = ensure_has_len(target) # target may be an iterator
if not isinstance(target, Index) and len(target) == 0:
- target = self[:0]
+ values: Union[range, ExtensionArray, np.ndarray]
+ if isinstance(self, ABCRangeIndex):
+ values = range(0)
+ else:
+ values = self._data[:0] # appropriately-dtyped empty array
+ target = self._simple_new(values, name=self.name)
else:
target = ensure_index(target)
@@ -3826,7 +3820,6 @@ def _join_non_unique(self, other, how="left", return_indexers=False):
else:
return join_index
- @final
def _join_level(
self, other, level, how="left", return_indexers=False, keep_order=True
):
@@ -3970,7 +3963,6 @@ def _get_leaf_sorter(labels):
else:
return join_index
- @final
def _join_monotonic(self, other, how="left", return_indexers=False):
# We only get here with matching dtypes
assert other.dtype == self.dtype
@@ -4188,6 +4180,12 @@ def _coerce_scalar_to_index(self, item):
return Index([item], dtype=dtype, **self._get_attributes_dict())
+ def _to_safe_for_reshape(self):
+ """
+ Convert to object if we are a categorical.
+ """
+ return self
+
def _validate_fill_value(self, value):
"""
Check if the value can be inserted into our array, and convert
@@ -4738,10 +4736,7 @@ def shift(self, periods=1, freq=None):
'2012-03-01'],
dtype='datetime64[ns]', freq='MS')
"""
- raise NotImplementedError(
- f"This method is only implemented for DatetimeIndex, PeriodIndex and "
- f"TimedeltaIndex; Got type {type(self).__name__}"
- )
+ raise NotImplementedError(f"Not supported for type {type(self).__name__}")
def argsort(self, *args, **kwargs) -> np.ndarray:
"""
@@ -4906,31 +4901,16 @@ def get_indexer_non_unique(self, target):
# Treat boolean labels passed to a numeric index as not found. Without
# this fix False and True would be treated as 0 and 1 respectively.
# (GH #16877)
- return self._get_indexer_non_comparable(target, method=None, unique=False)
+ no_matches = -1 * np.ones(self.shape, dtype=np.intp)
+ return no_matches, no_matches
pself, ptarget = self._maybe_promote(target)
if pself is not self or ptarget is not target:
return pself.get_indexer_non_unique(ptarget)
- if not self._should_compare(target):
- return self._get_indexer_non_comparable(target, method=None, unique=False)
-
- if not is_dtype_equal(self.dtype, target.dtype):
- # TODO: if object, could use infer_dtype to pre-empt costly
- # conversion if still non-comparable?
- dtype = find_common_type([self.dtype, target.dtype])
- if (
- dtype.kind in ["i", "u"]
- and is_categorical_dtype(target.dtype)
- and target.hasnans
- ):
- # FIXME: find_common_type incorrect with Categorical GH#38240
- # FIXME: some cases where float64 cast can be lossy?
- dtype = np.dtype(np.float64)
-
- this = self.astype(dtype, copy=False)
- that = target.astype(dtype, copy=False)
- return this.get_indexer_non_unique(that)
+ if not self._is_comparable_dtype(target.dtype):
+ no_matches = -1 * np.ones(self.shape, dtype=np.intp)
+ return no_matches, no_matches
if is_categorical_dtype(target.dtype):
tgt_values = np.asarray(target)
@@ -4958,43 +4938,6 @@ def get_indexer_for(self, target, **kwargs):
indexer, _ = self.get_indexer_non_unique(target)
return indexer
- def _get_indexer_non_comparable(self, target: "Index", method, unique: bool = True):
- """
- Called from get_indexer or get_indexer_non_unique when the target
- is of a non-comparable dtype.
-
- For get_indexer lookups with method=None, get_indexer is an _equality_
- check, so non-comparable dtypes mean we will always have no matches.
-
- For get_indexer lookups with a method, get_indexer is an _inequality_
- check, so non-comparable dtypes mean we will always raise TypeError.
-
- Parameters
- ----------
- target : Index
- method : str or None
- unique : bool, default True
- * True if called from get_indexer.
- * False if called from get_indexer_non_unique.
-
- Raises
- ------
- TypeError
- If doing an inequality check, i.e. method is not None.
- """
- if method is not None:
- other = unpack_nested_dtype(target)
- raise TypeError(f"Cannot compare dtypes {self.dtype} and {other.dtype}")
-
- no_matches = -1 * np.ones(target.shape, dtype=np.intp)
- if unique:
- # This is for get_indexer
- return no_matches
- else:
- # This is for get_indexer_non_unique
- missing = np.arange(len(target), dtype=np.intp)
- return no_matches, missing
-
@property
def _index_as_unique(self):
"""
@@ -5030,14 +4973,6 @@ def _maybe_promote(self, other: "Index"):
return self, other
- def _should_compare(self, other: "Index") -> bool:
- """
- Check if `self == other` can ever have non-False entries.
- """
- other = unpack_nested_dtype(other)
- dtype = other.dtype
- return self._is_comparable_dtype(dtype) or is_object_dtype(dtype)
-
def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
"""
Can we compare values of the given dtype to our own?
@@ -5581,7 +5516,7 @@ def drop(self, labels, errors: str_t = "raise"):
"""
arr_dtype = "object" if self.dtype == "object" else None
labels = com.index_labels_to_array(labels, dtype=arr_dtype)
- indexer = self.get_indexer_for(labels)
+ indexer = self.get_indexer(labels)
mask = indexer == -1
if mask.any():
if errors != "ignore":
@@ -6185,24 +6120,3 @@ def get_unanimous_names(*indexes: Index) -> Tuple[Label, ...]:
name_sets = [{*ns} for ns in zip_longest(*name_tups)]
names = tuple(ns.pop() if len(ns) == 1 else None for ns in name_sets)
return names
-
-
-def unpack_nested_dtype(other: Index) -> Index:
- """
- When checking if our dtype is comparable with another, we need
- to unpack CategoricalDtype to look at its categories.dtype.
-
- Parameters
- ----------
- other : Index
-
- Returns
- -------
- Index
- """
- dtype = other.dtype
- if is_categorical_dtype(dtype):
- # If there is ever a SparseIndex, this could get dispatched
- # here too.
- return dtype.categories
- return other
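
The branch restored above falls back to an all--1 indexer when the target dtype cannot be compared with the index, because with ``method=None`` the lookup is a pure equality check. A minimal sketch of that convention using only public ``Index`` methods (the exact shape of the returned arrays is an assumption and may differ between pandas versions):

    import pandas as pd

    idx = pd.Index([1, 2, 3])
    target = pd.Index(["a", "b"])        # object dtype, not comparable to int64

    indexer, missing = idx.get_indexer_non_unique(target)
    print(indexer)   # expected: all -1, i.e. no label matches
    print(missing)   # positions in `target` that could not be located
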
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
index 377fff5f85e92..e2507aeaeb652 100644
--- a/pandas/core/indexes/category.py
+++ b/pandas/core/indexes/category.py
@@ -27,7 +27,7 @@
import pandas.core.missing as missing
_index_doc_kwargs = dict(ibase._index_doc_kwargs)
-_index_doc_kwargs.update({"target_klass": "CategoricalIndex"})
+_index_doc_kwargs.update(dict(target_klass="CategoricalIndex"))
@inherit_names(
@@ -399,6 +399,10 @@ def unique(self, level=None):
# of result, not self.
return type(self)._simple_new(result, name=self.name)
+ def _to_safe_for_reshape(self):
+ """ convert to object if we are a categorical """
+ return self.astype("object")
+
def reindex(self, target, method=None, level=None, limit=None, tolerance=None):
"""
Create index with target's values (move/add/delete values as necessary)
@@ -554,9 +558,6 @@ def _maybe_cast_slice_bound(self, label, side: str, kind):
# --------------------------------------------------------------------
- def _is_comparable_dtype(self, dtype):
- return self.categories._is_comparable_dtype(dtype)
-
def take_nd(self, *args, **kwargs):
"""Alias for `take`"""
warnings.warn(
@@ -636,19 +637,11 @@ def map(self, mapper):
mapped = self._values.map(mapper)
return Index(mapped, name=self.name)
- def _concat(self, to_concat: List["Index"], name: Label) -> Index:
+ def _concat(self, to_concat: List["Index"], name: Label) -> "CategoricalIndex":
# if calling index is category, don't check dtype of others
- try:
- codes = np.concatenate([self._is_dtype_compat(c).codes for c in to_concat])
- except TypeError:
- # not all to_concat elements are among our categories (or NA)
- from pandas.core.dtypes.concat import concat_compat
-
- res = concat_compat(to_concat)
- return Index(res, name=name)
- else:
- cat = self._data._from_backing_data(codes)
- return type(self)._simple_new(cat, name=name)
+ codes = np.concatenate([self._is_dtype_compat(c).codes for c in to_concat])
+ cat = self._data._from_backing_data(codes)
+ return type(self)._simple_new(cat, name=name)
def _delegate_method(self, name: str, *args, **kwargs):
""" method delegation to the ._values """
diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py
index f0d4d36531e0d..1b18f04ba603d 100644
--- a/pandas/core/indexes/datetimelike.py
+++ b/pandas/core/indexes/datetimelike.py
@@ -200,7 +200,7 @@ def __contains__(self, key: Any) -> bool:
@Appender(_index_shared_docs["take"] % _index_doc_kwargs)
def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs):
- nv.validate_take((), kwargs)
+ nv.validate_take(tuple(), kwargs)
indices = np.asarray(indices, dtype=np.intp)
maybe_slice = lib.maybe_indices_to_slice(indices, len(self))
@@ -686,19 +686,10 @@ def intersection(self, other, sort=False):
"""
self._validate_sort_keyword(sort)
self._assert_can_do_setop(other)
- other, _ = self._convert_can_do_setop(other)
if self.equals(other):
- if self.has_duplicates:
- return self.unique()._get_reconciled_name_object(other)
return self._get_reconciled_name_object(other)
- return self._intersection(other, sort=sort)
-
- def _intersection(self, other: Index, sort=False) -> Index:
- """
- intersection specialized to the case with matching dtypes.
- """
if len(self) == 0:
return self.copy()._get_reconciled_name_object(other)
if len(other) == 0:
@@ -706,14 +697,17 @@ def _intersection(self, other: Index, sort=False) -> Index:
if not isinstance(other, type(self)):
result = Index.intersection(self, other, sort=sort)
+ if isinstance(result, type(self)):
+ if result.freq is None:
+ # TODO: no tests rely on this; needed?
+ result = result._with_freq("infer")
return result
elif not self._can_fast_intersect(other):
- result = Index._intersection(self, other, sort=sort)
- # We need to invalidate the freq because Index._intersection
+ result = Index.intersection(self, other, sort=sort)
+ # We need to invalidate the freq because Index.intersection
# uses _shallow_copy on a view of self._data, which will preserve
# self.freq if we're not careful.
- result = self._wrap_setop_result(other, result)
return result._with_freq(None)._with_freq("infer")
# to make our life easier, "sort" the two ranges
diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py
index 8329c41a74596..f6eeb121b1ac0 100644
--- a/pandas/core/indexes/datetimes.py
+++ b/pandas/core/indexes/datetimes.py
@@ -227,7 +227,6 @@ class DatetimeIndex(DatetimeTimedeltaMixin):
_is_numeric_dtype = False
_data: DatetimeArray
- inferred_freq: Optional[str]
tz: Optional[tzinfo]
# --------------------------------------------------------------------
@@ -338,7 +337,7 @@ def __reduce__(self):
# we use a special reduce here because we need
# to simply set the .tz (and not reinterpret it)
- d = {"data": self._data}
+ d = dict(data=self._data)
d.update(self._get_attributes_dict())
return _new_DatetimeIndex, (type(self), d), None
diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py
index 92bd82f8263e9..3f146e273326c 100644
--- a/pandas/core/indexes/extension.py
+++ b/pandas/core/indexes/extension.py
@@ -273,7 +273,7 @@ def _get_engine_target(self) -> np.ndarray:
return np.asarray(self._data)
def repeat(self, repeats, axis=None):
- nv.validate_repeat((), {"axis": axis})
+ nv.validate_repeat(tuple(), dict(axis=axis))
result = self._data.repeat(repeats, axis=axis)
return type(self)._simple_new(result, name=self.name)
diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py
index 2f86d9c20bfe8..ed92b3dade6a0 100644
--- a/pandas/core/indexes/interval.py
+++ b/pandas/core/indexes/interval.py
@@ -11,7 +11,7 @@
from pandas._libs import lib
from pandas._libs.interval import Interval, IntervalMixin, IntervalTree
from pandas._libs.tslibs import BaseOffset, Timedelta, Timestamp, to_offset
-from pandas._typing import AnyArrayLike, DtypeObj, Label
+from pandas._typing import AnyArrayLike, Label
from pandas.errors import InvalidIndexError
from pandas.util._decorators import Appender, Substitution, cache_readonly
from pandas.util._exceptions import rewrite_exception
@@ -38,7 +38,6 @@
is_object_dtype,
is_scalar,
)
-from pandas.core.dtypes.dtypes import IntervalDtype
from pandas.core.algorithms import take_1d
from pandas.core.arrays.interval import IntervalArray, _interval_shared_docs
@@ -51,7 +50,6 @@
default_pprint,
ensure_index,
maybe_extract_name,
- unpack_nested_dtype,
)
from pandas.core.indexes.datetimes import DatetimeIndex, date_range
from pandas.core.indexes.extension import ExtensionIndex, inherit_names
@@ -124,9 +122,8 @@ def setop_check(method):
@wraps(method)
def wrapped(self, other, sort=False):
- self._validate_sort_keyword(sort)
self._assert_can_do_setop(other)
- other, _ = self._convert_can_do_setop(other)
+ other = ensure_index(other)
if not isinstance(other, IntervalIndex):
result = getattr(self.astype(object), op_name)(other)
@@ -134,6 +131,14 @@ def wrapped(self, other, sort=False):
result = result.astype(self.dtype)
return result
+ if self._is_non_comparable_own_type(other):
+ # GH#19016: ensure set op will not return a prohibited dtype
+ raise TypeError(
+ "can only do set operations between two IntervalIndex "
+ "objects that are closed on the same side "
+ "and have compatible dtypes"
+ )
+
return method(self, other, sort)
return wrapped
@@ -805,19 +810,6 @@ def _convert_list_indexer(self, keyarr):
return locs
- def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
- if not isinstance(dtype, IntervalDtype):
- return False
- common_subtype = find_common_type([self.dtype.subtype, dtype.subtype])
- return not is_object_dtype(common_subtype)
-
- def _should_compare(self, other) -> bool:
- if not super()._should_compare(other):
- return False
- other = unpack_nested_dtype(other)
- return other.closed == self.closed
-
- # TODO: use should_compare and get rid of _is_non_comparable_own_type
def _is_non_comparable_own_type(self, other: "IntervalIndex") -> bool:
# different closed or incompatible subtype -> no matches
@@ -825,7 +817,8 @@ def _is_non_comparable_own_type(self, other: "IntervalIndex") -> bool:
# is_comparable_dtype GH#19371
if self.closed != other.closed:
return True
- return not self._is_comparable_dtype(other.dtype)
+ common_subtype = find_common_type([self.dtype.subtype, other.dtype.subtype])
+ return is_object_dtype(common_subtype)
# --------------------------------------------------------------------
@@ -963,37 +956,11 @@ def _format_space(self) -> str:
# --------------------------------------------------------------------
# Set Operations
- def _assert_can_do_setop(self, other):
- super()._assert_can_do_setop(other)
-
- if isinstance(other, IntervalIndex) and self._is_non_comparable_own_type(other):
- # GH#19016: ensure set op will not return a prohibited dtype
- raise TypeError(
- "can only do set operations between two IntervalIndex "
- "objects that are closed on the same side "
- "and have compatible dtypes"
- )
-
@Appender(Index.intersection.__doc__)
- def intersection(self, other, sort=False) -> Index:
- self._validate_sort_keyword(sort)
- self._assert_can_do_setop(other)
- other, _ = self._convert_can_do_setop(other)
-
- if self.equals(other) and not self.has_duplicates:
- return self._get_reconciled_name_object(other)
-
- if not isinstance(other, IntervalIndex):
- return self.astype(object).intersection(other)
-
- result = self._intersection(other, sort=sort)
- return self._wrap_setop_result(other, result)
-
- def _intersection(self, other, sort):
- """
- intersection specialized to the case with matching dtypes.
- """
- # For IntervalIndex we also know other.closed == self.closed
+ @setop_check
+ def intersection(
+ self, other: "IntervalIndex", sort: bool = False
+ ) -> "IntervalIndex":
if self.left.is_unique and self.right.is_unique:
taken = self._intersection_unique(other)
elif other.left.is_unique and other.right.is_unique and self.isna().sum() <= 1:
@@ -1007,7 +974,7 @@ def _intersection(self, other, sort):
if sort is None:
taken = taken.sort_values()
- return taken
+ return self._wrap_setop_result(other, taken)
def _intersection_unique(self, other: "IntervalIndex") -> "IntervalIndex":
"""
@@ -1060,10 +1027,6 @@ def _intersection_non_unique(self, other: "IntervalIndex") -> "IntervalIndex":
def _setop(op_name: str, sort=None):
def func(self, other, sort=sort):
- # At this point we are assured
- # isinstance(other, IntervalIndex)
- # other.closed == self.closed
-
result = getattr(self._multiindex, op_name)(other._multiindex, sort=sort)
result_name = get_op_result_name(self, other)
@@ -1078,7 +1041,7 @@ def func(self, other, sort=sort):
func.__name__ = op_name
return setop_check(func)
- _union = _setop("union")
+ union = _setop("union")
difference = _setop("difference")
symmetric_difference = _setop("symmetric_difference")
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index fd47c23b7c92b..9b4b459d9a122 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -20,7 +20,7 @@
from pandas._libs import algos as libalgos, index as libindex, lib
from pandas._libs.hashtable import duplicated_int64
-from pandas._typing import AnyArrayLike, DtypeObj, Label, Scalar, Shape
+from pandas._typing import AnyArrayLike, Label, Scalar, Shape
from pandas.compat.numpy import function as nv
from pandas.errors import InvalidIndexError, PerformanceWarning, UnsortedIndexError
from pandas.util._decorators import Appender, cache_readonly, doc
@@ -1684,6 +1684,10 @@ def unique(self, level=None):
level = self._get_level_number(level)
return self._get_level_values(level=level, unique=True)
+ def _to_safe_for_reshape(self):
+ """ convert to object if we are a categorical """
+ return self.set_levels([i._to_safe_for_reshape() for i in self.levels])
+
def to_frame(self, index=True, name=None):
"""
Create a DataFrame with the levels of the MultiIndex as columns.
@@ -2165,8 +2169,7 @@ def drop(self, codes, level=None, errors="raise"):
if isinstance(loc, int):
inds.append(loc)
elif isinstance(loc, slice):
- step = loc.step if loc.step is not None else 1
- inds.extend(range(loc.start, loc.stop, step))
+ inds.extend(range(loc.start, loc.stop))
elif com.is_bool_indexer(loc):
if self.lexsort_depth == 0:
warnings.warn(
@@ -2527,10 +2530,6 @@ def _get_values_for_loc(self, series: "Series", loc, key):
if is_scalar(loc):
return new_values
- if len(new_values) == 1 and not self.nlevels > 1:
- # If more than one level left, we can not return a scalar
- return new_values[0]
-
new_index = self[loc]
new_index = maybe_droplevels(new_index, key)
new_ser = series._constructor(new_values, index=new_index, name=series.name)
@@ -3079,11 +3078,8 @@ def convert_indexer(start, stop, step, indexer=indexer, codes=level_codes):
# given the inputs and the codes/indexer, compute an indexer set
# if we have a provided indexer, then this need not consider
# the entire labels set
- if step is not None and step < 0:
- # Switch elements for negative step size
- start, stop = stop - 1, start - 1
- r = np.arange(start, stop, step)
+ r = np.arange(start, stop, step)
if indexer is not None and len(indexer) != len(codes):
# we have an indexer which maps the locations in the labels
@@ -3346,8 +3342,6 @@ def _reorder_indexer(
k_codes = k_codes[k_codes >= 0] # Filter absent keys
# True if the given codes are not ordered
need_sort = (k_codes[:-1] > k_codes[1:]).any()
- elif isinstance(k, slice) and k.step is not None and k.step < 0:
- need_sort = True
# Bail out if both index and seq are sorted
if not need_sort:
return indexer
@@ -3374,8 +3368,6 @@ def _reorder_indexer(
key_order_map[level_indexer] = np.arange(len(level_indexer))
new_order = key_order_map[self.codes[i][indexer]]
- elif isinstance(k, slice) and k.step is not None and k.step < 0:
- new_order = np.arange(n)[k][indexer]
elif isinstance(k, slice) and k.start is None and k.stop is None:
# slice(None) should not determine order GH#31330
new_order = np.ones((n,))[indexer]
@@ -3569,11 +3561,6 @@ def union(self, other, sort=None):
if len(other) == 0 or self.equals(other):
return self.rename(result_names)
- return self._union(other, sort=sort)
-
- def _union(self, other, sort):
- other, result_names = self._convert_can_do_setop(other)
-
# TODO: Index.union returns other when `len(self)` is 0.
if not is_object_dtype(other.dtype):
@@ -3588,9 +3575,6 @@ def _union(self, other, sort):
zip(*uniq_tuples), sortorder=0, names=result_names
)
- def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
- return is_object_dtype(dtype)
-
def intersection(self, other, sort=False):
"""
Form the intersection of two MultiIndex objects.
@@ -3617,18 +3601,17 @@ def intersection(self, other, sort=False):
other, result_names = self._convert_can_do_setop(other)
if self.equals(other):
- if self.has_duplicates:
- return self.unique().rename(result_names)
- return self._get_reconciled_name_object(other)
-
- return self._intersection(other, sort=sort)
-
- def _intersection(self, other, sort=False):
- other, result_names = self._convert_can_do_setop(other)
+ return self.rename(result_names)
- if not self._is_comparable_dtype(other.dtype):
+ if not is_object_dtype(other.dtype):
# The intersection is empty
- return self[:0].rename(result_names)
+ # TODO: we have no tests that get here
+ return MultiIndex(
+ levels=self.levels,
+ codes=[[]] * self.nlevels,
+ names=result_names,
+ verify_integrity=False,
+ )
lvals = self._values
rvals = other._values
@@ -3636,12 +3619,10 @@ def _intersection(self, other, sort=False):
uniq_tuples = None # flag whether _inner_indexer was successful
if self.is_monotonic and other.is_monotonic:
try:
- inner_tuples = self._inner_indexer(lvals, rvals)[0]
- sort = False # inner_tuples is already sorted
+ uniq_tuples = self._inner_indexer(lvals, rvals)[0]
+ sort = False # uniq_tuples is already sorted
except TypeError:
pass
- else:
- uniq_tuples = algos.unique(inner_tuples)
if uniq_tuples is None:
other_uniq = set(rvals)
@@ -3732,14 +3713,16 @@ def _convert_can_do_setop(self, other):
if not isinstance(other, Index):
if len(other) == 0:
- return self[:0], self.names
+ other = MultiIndex(
+ levels=[[]] * self.nlevels,
+ codes=[[]] * self.nlevels,
+ verify_integrity=False,
+ )
else:
msg = "other must be a MultiIndex or a list of tuples"
try:
- other = MultiIndex.from_tuples(other, names=self.names)
- except (ValueError, TypeError) as err:
- # ValueError raised by tuples_to_object_array if we
- # have non-object dtype
+ other = MultiIndex.from_tuples(other)
+ except TypeError as err:
raise TypeError(msg) from err
else:
result_names = get_unanimous_names(self, other)
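
``_convert_can_do_setop`` above now builds an empty ``MultiIndex`` with the right number of levels when the other operand is an empty non-Index. The user-visible effect is roughly the following (the exact result metadata is an assumption):

    import pandas as pd

    mi = pd.MultiIndex.from_tuples([("a", 1), ("b", 2)], names=["k1", "k2"])
    result = mi.intersection([])         # empty operand, still two levels
    print(result.nlevels, len(result))   # expected: 2 0
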
diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py
index ed76e26a57634..12f61fc44582d 100644
--- a/pandas/core/indexes/numeric.py
+++ b/pandas/core/indexes/numeric.py
@@ -4,7 +4,7 @@
import numpy as np
from pandas._libs import index as libindex, lib
-from pandas._typing import Dtype, DtypeObj, Label
+from pandas._typing import Dtype, Label
from pandas.util._decorators import doc
from pandas.core.dtypes.cast import astype_nansafe
@@ -29,7 +29,7 @@
import pandas.core.common as com
from pandas.core.indexes.base import Index, maybe_extract_name
-_num_index_shared_docs = {}
+_num_index_shared_docs = dict()
class NumericIndex(Index):
@@ -148,10 +148,6 @@ def _convert_tolerance(self, tolerance, target):
)
return tolerance
- def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
- # If we ever have BoolIndex or ComplexIndex, this may need to be tightened
- return is_numeric_dtype(dtype)
-
@classmethod
def _assert_safe_casting(cls, data, subarr):
"""
@@ -228,12 +224,7 @@ def _union(self, other, sort):
An Index instance can **only** contain hashable objects.
"""
-_int64_descr_args = {
- "klass": "Int64Index",
- "ltype": "integer",
- "dtype": "int64",
- "extra": "",
-}
+_int64_descr_args = dict(klass="Int64Index", ltype="integer", dtype="int64", extra="")
class IntegerIndex(NumericIndex):
@@ -295,12 +286,9 @@ class Int64Index(IntegerIndex):
_default_dtype = np.dtype(np.int64)
-_uint64_descr_args = {
- "klass": "UInt64Index",
- "ltype": "unsigned integer",
- "dtype": "uint64",
- "extra": "",
-}
+_uint64_descr_args = dict(
+ klass="UInt64Index", ltype="unsigned integer", dtype="uint64", extra=""
+)
class UInt64Index(IntegerIndex):
@@ -326,12 +314,9 @@ def _convert_arr_indexer(self, keyarr):
return com.asarray_tuplesafe(keyarr, dtype=dtype)
-_float64_descr_args = {
- "klass": "Float64Index",
- "dtype": "float64",
- "ltype": "float",
- "extra": "",
-}
+_float64_descr_args = dict(
+ klass="Float64Index", dtype="float64", ltype="float", extra=""
+)
class Float64Index(NumericIndex):
diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py
index 26bba4653007f..5dff07ee4c6dd 100644
--- a/pandas/core/indexes/period.py
+++ b/pandas/core/indexes/period.py
@@ -1,5 +1,5 @@
from datetime import datetime, timedelta
-from typing import Any
+from typing import Any, cast
import warnings
import numpy as np
@@ -43,7 +43,7 @@
from pandas.core.ops import get_op_result_name
_index_doc_kwargs = dict(ibase._index_doc_kwargs)
-_index_doc_kwargs.update({"target_klass": "PeriodIndex or list of Periods"})
+_index_doc_kwargs.update(dict(target_klass="PeriodIndex or list of Periods"))
# --- Period index sketch
@@ -452,10 +452,13 @@ def join(self, other, how="left", level=None, return_indexers=False, sort=False)
def get_indexer(self, target, method=None, limit=None, tolerance=None):
target = ensure_index(target)
- if not self._should_compare(target):
- return self._get_indexer_non_comparable(target, method, unique=True)
-
if isinstance(target, PeriodIndex):
+ if not self._is_comparable_dtype(target.dtype):
+ # i.e. target.freq != self.freq
+ # No matches
+ no_matches = -1 * np.ones(self.shape, dtype=np.intp)
+ return no_matches
+
target = target._get_engine_target() # i.e. target.asi8
self_index = self._int64index
else:
@@ -636,19 +639,15 @@ def _setop(self, other, sort, opname: str):
def intersection(self, other, sort=False):
self._validate_sort_keyword(sort)
self._assert_can_do_setop(other)
- other, _ = self._convert_can_do_setop(other)
+ other = ensure_index(other)
if self.equals(other):
return self._get_reconciled_name_object(other)
- return self._intersection(other, sort=sort)
-
- def _intersection(self, other, sort=False):
-
- if is_object_dtype(other.dtype):
+ elif is_object_dtype(other.dtype):
return self.astype("O").intersection(other, sort=sort)
- elif not self._is_comparable_dtype(other.dtype):
+ elif not is_dtype_equal(self.dtype, other.dtype):
# We can infer that the intersection is empty.
# assert_can_do_setop ensures that this is not just a mismatched freq
this = self[:0].astype("O")
@@ -660,14 +659,14 @@ def _intersection(self, other, sort=False):
def difference(self, other, sort=None):
self._validate_sort_keyword(sort)
self._assert_can_do_setop(other)
- other, result_name = self._convert_can_do_setop(other)
+ other = ensure_index(other)
if self.equals(other):
- return self[:0].rename(result_name)
-
- return self._difference(other, sort=sort)
+ # pass an empty PeriodArray with the appropriate dtype
- def _difference(self, other, sort):
+ # TODO: overload DatetimeLikeArrayMixin.__getitem__
+ values = cast(PeriodArray, self._data[:0])
+ return type(self)._simple_new(values, name=self.name)
if is_object_dtype(other):
return self.astype(object).difference(other).astype(self.dtype)
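
Per the ``get_indexer`` branch added earlier in this file, a ``PeriodIndex`` target with an incompatible frequency can never match, so every position is reported as -1. A hedged sketch of that expectation (exact behaviour varies between pandas versions):

    import pandas as pd

    pi = pd.period_range("2020-01", periods=3, freq="M")
    target = pd.period_range("2020-01-01", periods=3, freq="D")

    print(pi.get_indexer(target))   # expected: [-1 -1 -1], no matches across freqs
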
diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py
index ec896d94a20ba..669bf115df104 100644
--- a/pandas/core/indexes/range.py
+++ b/pandas/core/indexes/range.py
@@ -482,11 +482,34 @@ def equals(self, other: object) -> bool:
# --------------------------------------------------------------------
# Set Operations
- def _intersection(self, other, sort=False):
+ def intersection(self, other, sort=False):
+ """
+ Form the intersection of two Index objects.
+
+ Parameters
+ ----------
+ other : Index or array-like
+ sort : False or None, default False
+ Sort the resulting index if possible
+
+ .. versionadded:: 0.24.0
+
+ .. versionchanged:: 0.24.1
+
+ Changed the default to ``False`` to match the behaviour
+ from before 0.24.0.
+
+ Returns
+ -------
+ intersection : Index
+ """
+ self._validate_sort_keyword(sort)
+
+ if self.equals(other):
+ return self._get_reconciled_name_object(other)
if not isinstance(other, RangeIndex):
- # Int64Index
- return super()._intersection(other, sort=sort)
+ return super().intersection(other, sort=sort)
if not len(self) or not len(other):
return self._simple_new(_empty_range)
@@ -528,7 +551,7 @@ def _intersection(self, other, sort=False):
if sort is None:
new_index = new_index.sort_values()
- return new_index
+ return self._wrap_setop_result(other, new_index)
def _min_fitting_element(self, lower_limit: int) -> int:
"""Returns the smallest element greater than or equal to the limit"""
@@ -629,8 +652,6 @@ def _union(self, other, sort):
def difference(self, other, sort=None):
# optimized set operation if we have another RangeIndex
self._validate_sort_keyword(sort)
- self._assert_can_do_setop(other)
- other, result_name = self._convert_can_do_setop(other)
if not isinstance(other, RangeIndex):
return super().difference(other, sort=sort)
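
The ``RangeIndex.intersection`` override above computes the overlap of two ranges arithmetically, so the result stays range-backed instead of materializing an ``Int64Index``. A quick illustration with two stepped ranges:

    import pandas as pd

    a = pd.RangeIndex(0, 10, 2)      # 0, 2, 4, 6, 8
    b = pd.RangeIndex(4, 20, 4)      # 4, 8, 12, 16

    inter = a.intersection(b)
    print(list(inter))               # [4, 8]
    print(type(inter).__name__)      # RangeIndex
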
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
index e7cf8cae28b88..6aa031af64833 100644
--- a/pandas/core/indexing.py
+++ b/pandas/core/indexing.py
@@ -259,11 +259,10 @@ def loc(self) -> "_LocIndexer":
e.g. ``[True, False, True]``.
- An alignable boolean Series. The index of the key will be aligned before
masking.
- - An alignable Index. The Index of the returned selection will be the input.
- A ``callable`` function with one argument (the calling Series or
DataFrame) and that returns valid output for indexing (one of the above)
- See more at :ref:`Selection by Label `.
+ See more at :ref:`Selection by Label `
Raises
------
@@ -333,14 +332,6 @@ def loc(self) -> "_LocIndexer":
max_speed shield
sidewinder 7 8
- Index (same behavior as ``df.reindex``)
-
- >>> df.loc[pd.Index(["cobra", "viper"], name="foo")]
- max_speed shield
- foo
- cobra 1 2
- viper 4 5
-
Conditional that returns a boolean Series
>>> df.loc[df['shield'] > 6]
@@ -672,12 +663,17 @@ def _ensure_listlike_indexer(self, key, axis=None, value=None):
and not com.is_bool_indexer(key)
and all(is_hashable(k) for k in key)
):
- # GH#38148
- keys = self.obj.columns.union(key, sort=False)
-
- self.obj._mgr = self.obj._mgr.reindex_axis(
- keys, axis=0, copy=False, consolidate=False, only_slice=True
- )
+ for i, k in enumerate(key):
+ if k not in self.obj:
+ if value is None:
+ self.obj[k] = np.nan
+ elif is_array_like(value) and value.ndim == 2:
+ # GH#37964 have to select columnwise in case of array
+ self.obj[k] = value[:, i]
+ elif is_list_like(value):
+ self.obj[k] = value[i]
+ else:
+ self.obj[k] = value
def __setitem__(self, key, value):
if isinstance(key, tuple):
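
The restored loop in ``_ensure_listlike_indexer`` adds new columns one at a time, taking ``value[:, i]`` when the assigned value is a 2D array (GH#37964). Roughly, this is the enlargement pattern it supports; a sketch only, since the exact code path differs across versions:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"a": [1, 2]})
    # Two brand-new columns set at once from a 2D array: each array column is
    # expected to land in the corresponding new DataFrame column.
    df.loc[:, ["b", "c"]] = np.array([[10, 20], [30, 40]])
    print(df)
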
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index fe07823a80783..74b5a184df95d 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -700,6 +700,7 @@ def convert(
datetime: bool = True,
numeric: bool = True,
timedelta: bool = True,
+ coerce: bool = False,
) -> List["Block"]:
"""
attempt to coerce any object types to better types return a copy
@@ -1261,7 +1262,6 @@ def interpolate(
axis=axis,
inplace=inplace,
limit=limit,
- limit_area=limit_area,
downcast=downcast,
)
# validate the interp method
@@ -1288,7 +1288,6 @@ def _interpolate_with_fill(
axis: int = 0,
inplace: bool = False,
limit: Optional[int] = None,
- limit_area: Optional[str] = None,
downcast: Optional[str] = None,
) -> List["Block"]:
""" fillna but using the interpolate machinery """
@@ -1303,7 +1302,6 @@ def _interpolate_with_fill(
method=method,
axis=axis,
limit=limit,
- limit_area=limit_area,
)
blocks = [self.make_block_same_class(values, ndim=self.ndim)]
@@ -1542,7 +1540,7 @@ def _unstack(self, unstacker, fill_value, new_placement):
new_values = new_values.T[mask]
new_placement = new_placement[mask]
- blocks = [make_block(new_values, placement=new_placement)]
+ blocks = [self.make_block_same_class(new_values, placement=new_placement)]
return blocks, mask
def quantile(self, qs, interpolation="linear", axis: int = 0):
@@ -2396,28 +2394,6 @@ def quantile(self, qs, interpolation="linear", axis=0):
aware = self._holder(res_blk.values.ravel(), dtype=self.dtype)
return self.make_block_same_class(aware, ndim=res_blk.ndim)
- def _check_ndim(self, values, ndim):
- """
- ndim inference and validation.
-
- This is overriden by the DatetimeTZBlock to check the case of 2D
- data (values.ndim == 2), which should only be allowed if ndim is
- also 2.
- The case of 1D array is still allowed with both ndim of 1 or 2, as
- if the case for other EAs. Therefore, we are only checking
- `values.ndim > ndim` instead of `values.ndim != ndim` as for
- consolidated blocks.
- """
- if ndim is None:
- ndim = values.ndim
-
- if values.ndim > ndim:
- raise ValueError(
- "Wrong number of dimensions. "
- f"values.ndim != ndim [{values.ndim} != {ndim}]"
- )
- return ndim
-
class TimeDeltaBlock(DatetimeLikeBlockMixin):
__slots__ = ()
@@ -2530,12 +2506,12 @@ def convert(
datetime: bool = True,
numeric: bool = True,
timedelta: bool = True,
+ coerce: bool = False,
) -> List["Block"]:
"""
- attempt to cast any object types to better types return a copy of
+ attempt to coerce any object types to better types return a copy of
the block (if copy = True) by definition we ARE an ObjectBlock!!!!!
"""
-
# operate column-by-column
def f(mask, val, idx):
shape = val.shape
@@ -2544,6 +2520,7 @@ def f(mask, val, idx):
datetime=datetime,
numeric=numeric,
timedelta=timedelta,
+ coerce=coerce,
copy=copy,
)
if isinstance(values, np.ndarray):
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
index 93ab207d8ce12..4cd7cc56144d9 100644
--- a/pandas/core/internals/managers.py
+++ b/pandas/core/internals/managers.py
@@ -267,7 +267,7 @@ def __getstate__(self):
"0.14.1": {
"axes": axes_array,
"blocks": [
- {"values": b.values, "mgr_locs": b.mgr_locs.indexer}
+ dict(values=b.values, mgr_locs=b.mgr_locs.indexer)
for b in self.blocks
],
}
@@ -636,6 +636,7 @@ def convert(
datetime: bool = True,
numeric: bool = True,
timedelta: bool = True,
+ coerce: bool = False,
) -> "BlockManager":
return self.apply(
"convert",
@@ -643,6 +644,7 @@ def convert(
datetime=datetime,
numeric=numeric,
timedelta=timedelta,
+ coerce=coerce,
)
def replace(self, to_replace, value, inplace: bool, regex: bool) -> "BlockManager":
@@ -1236,8 +1238,6 @@ def reindex_axis(
limit=None,
fill_value=None,
copy: bool = True,
- consolidate: bool = True,
- only_slice: bool = False,
):
"""
Conform block manager to new index.
@@ -1248,13 +1248,7 @@ def reindex_axis(
)
return self.reindex_indexer(
- new_index,
- indexer,
- axis=axis,
- fill_value=fill_value,
- copy=copy,
- consolidate=consolidate,
- only_slice=only_slice,
+ new_index, indexer, axis=axis, fill_value=fill_value, copy=copy
)
def reindex_indexer(
@@ -1266,7 +1260,6 @@ def reindex_indexer(
allow_dups: bool = False,
copy: bool = True,
consolidate: bool = True,
- only_slice: bool = False,
) -> T:
"""
Parameters
@@ -1279,8 +1272,6 @@ def reindex_indexer(
copy : bool, default True
consolidate: bool, default True
Whether to consolidate inplace before reindexing.
- only_slice : bool, default False
- Whether to take views, not copies, along columns.
pandas-indexer with -1's only.
"""
@@ -1304,9 +1295,7 @@ def reindex_indexer(
raise IndexError("Requested axis not found in manager")
if axis == 0:
- new_blocks = self._slice_take_blocks_ax0(
- indexer, fill_value=fill_value, only_slice=only_slice
- )
+ new_blocks = self._slice_take_blocks_ax0(indexer, fill_value=fill_value)
else:
new_blocks = [
blk.take_nd(
diff --git a/pandas/core/missing.py b/pandas/core/missing.py
index e374ba435a0bd..52536583b9b0d 100644
--- a/pandas/core/missing.py
+++ b/pandas/core/missing.py
@@ -1,13 +1,13 @@
"""
Routines for filling missing data.
"""
-from functools import partial
+
from typing import Any, List, Optional, Set, Union
import numpy as np
from pandas._libs import algos, lib
-from pandas._typing import ArrayLike, Axis, DtypeObj
+from pandas._typing import DtypeObj
from pandas.compat._optional import import_optional_dependency
from pandas.core.dtypes.cast import infer_dtype_from_array
@@ -15,45 +15,57 @@
ensure_float64,
is_integer_dtype,
is_numeric_v_string_like,
+ is_scalar,
needs_i8_conversion,
)
from pandas.core.dtypes.missing import isna
-def mask_missing(arr: ArrayLike, values_to_mask) -> np.ndarray:
+def mask_missing(arr, values_to_mask):
"""
Return a masking array of same size/shape as arr
with entries equaling any member of values_to_mask set to True
-
- Parameters
- ----------
- arr : ArrayLike
- values_to_mask: list, tuple, or scalar
-
- Returns
- -------
- np.ndarray[bool]
"""
- # When called from Block.replace/replace_list, values_to_mask is a scalar
- # known to be holdable by arr.
- # When called from Series._single_replace, values_to_mask is tuple or list
dtype, values_to_mask = infer_dtype_from_array(values_to_mask)
- values_to_mask = np.array(values_to_mask, dtype=dtype)
+
+ try:
+ values_to_mask = np.array(values_to_mask, dtype=dtype)
+
+ except Exception:
+ values_to_mask = np.array(values_to_mask, dtype=object)
na_mask = isna(values_to_mask)
nonna = values_to_mask[~na_mask]
- # GH 21977
- mask = np.zeros(arr.shape, dtype=bool)
+ mask = None
for x in nonna:
- if is_numeric_v_string_like(arr, x):
- # GH#29553 prevent numpy deprecation warnings
- pass
+ if mask is None:
+ if is_numeric_v_string_like(arr, x):
+ # GH#29553 prevent numpy deprecation warnings
+ mask = False
+ else:
+ mask = arr == x
+
+ # if x is a string and arr is not, then we get False and we must
+ # expand the mask to size arr.shape
+ if is_scalar(mask):
+ mask = np.zeros(arr.shape, dtype=bool)
else:
- mask |= arr == x
+ if is_numeric_v_string_like(arr, x):
+ # GH#29553 prevent numpy deprecation warnings
+ mask |= False
+ else:
+ mask |= arr == x
if na_mask.any():
- mask |= isna(arr)
+ if mask is None:
+ mask = isna(arr)
+ else:
+ mask |= isna(arr)
+
+ # GH 21977
+ if mask is None:
+ mask = np.zeros(arr.shape, dtype=bool)
return mask
@@ -528,92 +540,16 @@ def _cubicspline_interpolate(xi, yi, x, axis=0, bc_type="not-a-knot", extrapolat
return P(x)
-def _interpolate_with_limit_area(
- values: ArrayLike, method: str, limit: Optional[int], limit_area: Optional[str]
-) -> ArrayLike:
- """
- Apply interpolation and limit_area logic to values along a to-be-specified axis.
-
- Parameters
- ----------
- values: array-like
- Input array.
- method: str
- Interpolation method. Could be "bfill" or "pad"
- limit: int, optional
- Index limit on interpolation.
- limit_area: str
- Limit area for interpolation. Can be "inside" or "outside"
-
- Returns
- -------
- values: array-like
- Interpolated array.
- """
-
- invalid = isna(values)
-
- if not invalid.all():
- first = find_valid_index(values, "first")
- last = find_valid_index(values, "last")
-
- values = interpolate_2d(
- values,
- method=method,
- limit=limit,
- )
-
- if limit_area == "inside":
- invalid[first : last + 1] = False
- elif limit_area == "outside":
- invalid[:first] = invalid[last + 1 :] = False
-
- values[invalid] = np.nan
-
- return values
-
-
def interpolate_2d(
values,
- method: str = "pad",
- axis: Axis = 0,
- limit: Optional[int] = None,
- limit_area: Optional[str] = None,
+ method="pad",
+ axis=0,
+ limit=None,
):
"""
Perform an actual interpolation of values, values will be make 2-d if
needed fills inplace, returns the result.
-
- Parameters
- ----------
- values: array-like
- Input array.
- method: str, default "pad"
- Interpolation method. Could be "bfill" or "pad"
- axis: 0 or 1
- Interpolation axis
- limit: int, optional
- Index limit on interpolation.
- limit_area: str, optional
- Limit area for interpolation. Can be "inside" or "outside"
-
- Returns
- -------
- values: array-like
- Interpolated array.
"""
- if limit_area is not None:
- return np.apply_along_axis(
- partial(
- _interpolate_with_limit_area,
- method=method,
- limit=limit,
- limit_area=limit_area,
- ),
- axis,
- values,
- )
-
orig_values = values
transf = (lambda x: x) if axis == 0 else (lambda x: x.T)
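
``mask_missing`` above builds a boolean mask that is True wherever ``arr`` equals any member of ``values_to_mask``, with NA members matched against missing entries. A plain NumPy sketch of the same semantics:

    import numpy as np

    arr = np.array([1.0, 2.0, np.nan, 4.0])
    values_to_mask = [2.0, np.nan]

    mask = np.zeros(arr.shape, dtype=bool)
    for val in values_to_mask:
        if np.isnan(val):
            mask |= np.isnan(arr)    # NA members match missing entries
        else:
            mask |= arr == val
    print(mask)                      # [False  True  True False]
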
diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
index 88662a4fabed8..80c4cd5b44a92 100644
--- a/pandas/core/nanops.py
+++ b/pandas/core/nanops.py
@@ -12,6 +12,7 @@
from pandas._typing import ArrayLike, Dtype, DtypeObj, F, Scalar
from pandas.compat._optional import import_optional_dependency
+from pandas.core.dtypes.cast import maybe_upcast_putmask
from pandas.core.dtypes.common import (
get_dtype,
is_any_int_dtype,
@@ -283,7 +284,7 @@ def _get_values(
"""
# In _get_values is only called from within nanops, and in all cases
# with scalar fill_value. This guarantee is important for the
- # np.where call below
+ # maybe_upcast_putmask call below
assert is_scalar(fill_value)
values = extract_array(values, extract_numpy=True)
@@ -291,12 +292,10 @@ def _get_values(
dtype = values.dtype
- datetimelike = False
if needs_i8_conversion(values.dtype):
# changing timedelta64/datetime64 to int64 needs to happen after
# finding `mask` above
values = np.asarray(values.view("i8"))
- datetimelike = True
dtype_ok = _na_ok_dtype(dtype)
@@ -307,13 +306,13 @@ def _get_values(
)
if skipna and (mask is not None) and (fill_value is not None):
- if mask.any():
- if dtype_ok or datetimelike:
- values = values.copy()
- np.putmask(values, mask, fill_value)
- else:
- # np.where will promote if needed
- values = np.where(~mask, values, fill_value)
+ values = values.copy()
+ if dtype_ok and mask.any():
+ np.putmask(values, mask, fill_value)
+
+ # promote if needed
+ else:
+ values, _ = maybe_upcast_putmask(values, mask, fill_value)
# return a platform independent precision dtype
dtype_max = dtype
diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py
index d8b5dba424cbf..2b159c607b0a0 100644
--- a/pandas/core/ops/__init__.py
+++ b/pandas/core/ops/__init__.py
@@ -311,10 +311,7 @@ def should_reindex_frame_op(
# TODO: any other cases we should handle here?
cols = left.columns.intersection(right.columns)
- # Intersection is always unique so we have to check the unique columns
- left_uniques = left.columns.unique()
- right_uniques = right.columns.unique()
- if len(cols) and not (cols.equals(left_uniques) and cols.equals(right_uniques)):
+ if len(cols) and not (cols.equals(left.columns) and cols.equals(right.columns)):
# TODO: is there a shortcut available when len(cols) == 0?
return True
diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py
index 41d539564d91e..c855687552e82 100644
--- a/pandas/core/ops/array_ops.py
+++ b/pandas/core/ops/array_ops.py
@@ -30,7 +30,6 @@
from pandas.core.dtypes.generic import ABCExtensionArray, ABCIndexClass, ABCSeries
from pandas.core.dtypes.missing import isna, notna
-from pandas.core.construction import ensure_wrapped_if_datetimelike
from pandas.core.ops import missing
from pandas.core.ops.dispatch import should_extension_dispatch
from pandas.core.ops.invalid import invalid_comparison
@@ -176,8 +175,8 @@ def arithmetic_op(left: ArrayLike, right: Any, op):
# NB: We assume that extract_array has already been called
# on `left` and `right`.
- lvalues = ensure_wrapped_if_datetimelike(left)
- rvalues = ensure_wrapped_if_datetimelike(right)
+ lvalues = maybe_upcast_datetimelike_array(left)
+ rvalues = maybe_upcast_datetimelike_array(right)
rvalues = _maybe_upcast_for_op(rvalues, lvalues.shape)
if should_extension_dispatch(lvalues, rvalues) or isinstance(rvalues, Timedelta):
@@ -207,7 +206,7 @@ def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike:
ndarray or ExtensionArray
"""
# NB: We assume extract_array has already been called on left and right
- lvalues = ensure_wrapped_if_datetimelike(left)
+ lvalues = maybe_upcast_datetimelike_array(left)
rvalues = right
rvalues = lib.item_from_zerodim(rvalues)
@@ -332,7 +331,7 @@ def fill_bool(x, left=None):
right = construct_1d_object_array_from_listlike(right)
# NB: We assume extract_array has already been called on left and right
- lvalues = ensure_wrapped_if_datetimelike(left)
+ lvalues = maybe_upcast_datetimelike_array(left)
rvalues = right
if should_extension_dispatch(lvalues, rvalues):
@@ -401,6 +400,31 @@ def get_array_op(op):
raise NotImplementedError(op_name)
+def maybe_upcast_datetimelike_array(obj: ArrayLike) -> ArrayLike:
+ """
+ If we have an ndarray that is either datetime64 or timedelta64, wrap in EA.
+
+ Parameters
+ ----------
+ obj : ndarray or ExtensionArray
+
+ Returns
+ -------
+ ndarray or ExtensionArray
+ """
+ if isinstance(obj, np.ndarray):
+ if obj.dtype.kind == "m":
+ from pandas.core.arrays import TimedeltaArray
+
+ return TimedeltaArray._from_sequence(obj)
+ if obj.dtype.kind == "M":
+ from pandas.core.arrays import DatetimeArray
+
+ return DatetimeArray._from_sequence(obj)
+
+ return obj
+
+
def _maybe_upcast_for_op(obj, shape: Shape):
"""
Cast non-pandas objects to pandas types to unify behavior of arithmetic
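
``maybe_upcast_datetimelike_array`` above wraps raw datetime64/timedelta64 ndarrays in their extension arrays so arithmetic dispatches through the EA machinery. A minimal sketch of the datetime64 case, using ``_from_sequence`` as the hunk does (note it is a private constructor):

    import numpy as np
    from pandas.arrays import DatetimeArray

    raw = np.array(["2020-01-01", "2020-01-02"], dtype="datetime64[ns]")
    wrapped = DatetimeArray._from_sequence(raw)
    print(type(wrapped).__name__)    # DatetimeArray
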
diff --git a/pandas/core/ops/methods.py b/pandas/core/ops/methods.py
index 4866905d32b83..96a691da38b99 100644
--- a/pandas/core/ops/methods.py
+++ b/pandas/core/ops/methods.py
@@ -62,11 +62,11 @@ def add_flex_arithmetic_methods(cls):
flex_arith_method, flex_comp_method = _get_method_wrappers(cls)
new_methods = _create_methods(cls, flex_arith_method, flex_comp_method)
new_methods.update(
- {
- "multiply": new_methods["mul"],
- "subtract": new_methods["sub"],
- "divide": new_methods["div"],
- }
+ dict(
+ multiply=new_methods["mul"],
+ subtract=new_methods["sub"],
+ divide=new_methods["div"],
+ )
)
# opt out of bool flex methods for now
assert not any(kname in new_methods for kname in ("ror_", "rxor", "rand_"))
@@ -84,22 +84,22 @@ def _create_methods(cls, arith_method, comp_method):
new_methods = {}
new_methods.update(
- {
- "add": arith_method(operator.add),
- "radd": arith_method(radd),
- "sub": arith_method(operator.sub),
- "mul": arith_method(operator.mul),
- "truediv": arith_method(operator.truediv),
- "floordiv": arith_method(operator.floordiv),
- "mod": arith_method(operator.mod),
- "pow": arith_method(operator.pow),
- "rmul": arith_method(rmul),
- "rsub": arith_method(rsub),
- "rtruediv": arith_method(rtruediv),
- "rfloordiv": arith_method(rfloordiv),
- "rpow": arith_method(rpow),
- "rmod": arith_method(rmod),
- }
+ dict(
+ add=arith_method(operator.add),
+ radd=arith_method(radd),
+ sub=arith_method(operator.sub),
+ mul=arith_method(operator.mul),
+ truediv=arith_method(operator.truediv),
+ floordiv=arith_method(operator.floordiv),
+ mod=arith_method(operator.mod),
+ pow=arith_method(operator.pow),
+ rmul=arith_method(rmul),
+ rsub=arith_method(rsub),
+ rtruediv=arith_method(rtruediv),
+ rfloordiv=arith_method(rfloordiv),
+ rpow=arith_method(rpow),
+ rmod=arith_method(rmod),
+ )
)
new_methods["div"] = new_methods["truediv"]
new_methods["rdiv"] = new_methods["rtruediv"]
@@ -109,14 +109,14 @@ def _create_methods(cls, arith_method, comp_method):
new_methods["rdivmod"] = arith_method(rdivmod)
new_methods.update(
- {
- "eq": comp_method(operator.eq),
- "ne": comp_method(operator.ne),
- "lt": comp_method(operator.lt),
- "gt": comp_method(operator.gt),
- "le": comp_method(operator.le),
- "ge": comp_method(operator.ge),
- }
+ dict(
+ eq=comp_method(operator.eq),
+ ne=comp_method(operator.ne),
+ lt=comp_method(operator.lt),
+ gt=comp_method(operator.gt),
+ le=comp_method(operator.le),
+ ge=comp_method(operator.ge),
+ )
)
new_methods = {k.strip("_"): v for k, v in new_methods.items()}
diff --git a/pandas/core/resample.py b/pandas/core/resample.py
index f0b1228a5340c..a2f25bbcf38d3 100644
--- a/pandas/core/resample.py
+++ b/pandas/core/resample.py
@@ -43,7 +43,7 @@
from pandas.tseries.frequencies import is_subperiod, is_superperiod
from pandas.tseries.offsets import DateOffset, Day, Nano, Tick
-_shared_docs_kwargs: Dict[str, str] = {}
+_shared_docs_kwargs: Dict[str, str] = dict()
class Resampler(BaseGroupBy, ShallowMixin):
diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py
index f49aaee8bbc00..bcdb223415813 100644
--- a/pandas/core/reshape/melt.py
+++ b/pandas/core/reshape/melt.py
@@ -22,7 +22,7 @@
from pandas import DataFrame, Series
-@Appender(_shared_docs["melt"] % {"caller": "pd.melt(df, ", "other": "DataFrame.melt"})
+@Appender(_shared_docs["melt"] % dict(caller="pd.melt(df, ", other="DataFrame.melt"))
def melt(
frame: "DataFrame",
id_vars=None,
@@ -42,7 +42,7 @@ def melt(
if value_name in frame.columns:
warnings.warn(
"This dataframe has a column name that matches the 'value_name' column "
- "name of the resulting Dataframe. "
+ "name of the resultiing Dataframe. "
"In the future this will raise an error, please set the 'value_name' "
"parameter of DataFrame.melt to a unique name.",
FutureWarning,
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
index 2c6cdb846221f..3b755c40721fb 100644
--- a/pandas/core/reshape/merge.py
+++ b/pandas/core/reshape/merge.py
@@ -114,8 +114,11 @@ def _groupby_and_merge(by, on, left: "DataFrame", right: "DataFrame", merge_piec
# if we can groupby the rhs
# then we can get vastly better perf
- if all(item in right.columns for item in by):
+
+ try:
rby = right.groupby(by, sort=False)
+ except KeyError:
+ pass
for key, lhs in lby:
@@ -137,7 +140,9 @@ def _groupby_and_merge(by, on, left: "DataFrame", right: "DataFrame", merge_piec
# make sure join keys are in the merged
# TODO, should merge_pieces do this?
- merged[by] = key
+ for k in by:
+ if k in merged:
+ merged[k] = key
pieces.append(merged)
@@ -271,20 +276,10 @@ def _merger(x, y):
if left_by is not None and right_by is not None:
raise ValueError("Can only group either left or right frames")
elif left_by is not None:
- if isinstance(left_by, str):
- left_by = [left_by]
- check = set(left_by).difference(left.columns)
- if len(check) != 0:
- raise KeyError(f"{check} not found in left columns")
result, _ = _groupby_and_merge(
left_by, on, left, right, lambda x, y: _merger(x, y)
)
elif right_by is not None:
- if isinstance(right_by, str):
- right_by = [right_by]
- check = set(right_by).difference(right.columns)
- if len(check) != 0:
- raise KeyError(f"{check} not found in right columns")
result, _ = _groupby_and_merge(
right_by, on, right, left, lambda x, y: _merger(y, x)
)
@@ -1276,9 +1271,7 @@ def _validate_specification(self):
raise MergeError("Must pass left_on or left_index=True")
else:
# use the common columns
- left_cols = self.left.columns
- right_cols = self.right.columns
- common_cols = left_cols.intersection(right_cols)
+ common_cols = self.left.columns.intersection(self.right.columns)
if len(common_cols) == 0:
raise MergeError(
"No common columns to perform merge on. "
@@ -1287,10 +1280,7 @@ def _validate_specification(self):
f"left_index={self.left_index}, "
f"right_index={self.right_index}"
)
- if (
- not left_cols.join(common_cols, how="inner").is_unique
- or not right_cols.join(common_cols, how="inner").is_unique
- ):
+ if not common_cols.is_unique:
raise MergeError(f"Data columns not unique: {repr(common_cols)}")
self.left_on = self.right_on = common_cols
elif self.on is not None:
diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py
index 40496a5b8671b..c1198cdfcda81 100644
--- a/pandas/core/reshape/pivot.py
+++ b/pandas/core/reshape/pivot.py
@@ -5,7 +5,6 @@
List,
Optional,
Sequence,
- Set,
Tuple,
Union,
cast,
@@ -268,13 +267,19 @@ def _add_margins(
margin_dummy = DataFrame(row_margin, columns=[key]).T
row_names = result.index.names
- # check the result column and leave floats
- for dtype in set(result.dtypes):
- cols = result.select_dtypes([dtype]).columns
- margin_dummy[cols] = margin_dummy[cols].apply(
- maybe_downcast_to_dtype, args=(dtype,)
- )
- result = result.append(margin_dummy)
+ try:
+ # check the result column and leave floats
+ for dtype in set(result.dtypes):
+ cols = result.select_dtypes([dtype]).columns
+ margin_dummy[cols] = margin_dummy[cols].apply(
+ maybe_downcast_to_dtype, args=(dtype,)
+ )
+ result = result.append(margin_dummy)
+ except TypeError:
+
+ # we cannot reshape, so coerce the axis
+ result.index = result.index._to_safe_for_reshape()
+ result = result.append(margin_dummy)
result.index.names = row_names
return result
@@ -322,7 +327,16 @@ def _all_key(key):
# we are going to mutate this, so need to copy!
piece = piece.copy()
- piece[all_key] = margin[key]
+ try:
+ piece[all_key] = margin[key]
+ except ValueError:
+ # we cannot reshape, so coerce the axis
+ piece.set_axis(
+ piece._get_axis(cat_axis)._to_safe_for_reshape(),
+ axis=cat_axis,
+ inplace=True,
+ )
+ piece[all_key] = margin[key]
table_pieces.append(piece)
margin_keys.append(all_key)
@@ -564,37 +578,29 @@ def crosstab(
b 0 1 0
c 0 0 0
"""
- if values is None and aggfunc is not None:
- raise ValueError("aggfunc cannot be used without values.")
-
- if values is not None and aggfunc is None:
- raise ValueError("values cannot be used without an aggfunc.")
-
index = com.maybe_make_list(index)
columns = com.maybe_make_list(columns)
+ rownames = _get_names(index, rownames, prefix="row")
+ colnames = _get_names(columns, colnames, prefix="col")
+
common_idx = None
pass_objs = [x for x in index + columns if isinstance(x, (ABCSeries, ABCDataFrame))]
if pass_objs:
common_idx = get_objs_combined_axis(pass_objs, intersect=True, sort=False)
- rownames = _get_names(index, rownames, prefix="row")
- colnames = _get_names(columns, colnames, prefix="col")
+ data: Dict = {}
+ data.update(zip(rownames, index))
+ data.update(zip(colnames, columns))
- # duplicate names mapped to unique names for pivot op
- (
- rownames_mapper,
- unique_rownames,
- colnames_mapper,
- unique_colnames,
- ) = _build_names_mapper(rownames, colnames)
+ if values is None and aggfunc is not None:
+ raise ValueError("aggfunc cannot be used without values.")
+
+ if values is not None and aggfunc is None:
+ raise ValueError("values cannot be used without an aggfunc.")
from pandas import DataFrame
- data = {
- **dict(zip(unique_rownames, index)),
- **dict(zip(unique_colnames, columns)),
- }
df = DataFrame(data, index=common_idx)
original_df_cols = df.columns
@@ -607,8 +613,8 @@ def crosstab(
table = df.pivot_table(
["__dummy__"],
- index=unique_rownames,
- columns=unique_colnames,
+ index=rownames,
+ columns=colnames,
margins=margins,
margins_name=margins_name,
dropna=dropna,
@@ -627,9 +633,6 @@ def crosstab(
table, normalize=normalize, margins=margins, margins_name=margins_name
)
- table = table.rename_axis(index=rownames_mapper, axis=0)
- table = table.rename_axis(columns=colnames_mapper, axis=1)
-
return table
@@ -728,57 +731,3 @@ def _get_names(arrs, names, prefix: str = "row"):
names = list(names)
return names
-
-
-def _build_names_mapper(
- rownames: List[str], colnames: List[str]
-) -> Tuple[Dict[str, str], List[str], Dict[str, str], List[str]]:
- """
- Given the names of a DataFrame's rows and columns, returns a set of unique row
- and column names and mappers that convert to original names.
-
- A row or column name is replaced if it is duplicate among the rows of the inputs,
- among the columns of the inputs or between the rows and the columns.
-
- Paramters
- ---------
- rownames: list[str]
- colnames: list[str]
-
- Returns
- -------
- Tuple(Dict[str, str], List[str], Dict[str, str], List[str])
-
- rownames_mapper: dict[str, str]
- a dictionary with new row names as keys and original rownames as values
- unique_rownames: list[str]
- a list of rownames with duplicate names replaced by dummy names
- colnames_mapper: dict[str, str]
- a dictionary with new column names as keys and original column names as values
- unique_colnames: list[str]
- a list of column names with duplicate names replaced by dummy names
-
- """
-
- def get_duplicates(names):
- seen: Set = set()
- return {name for name in names if name not in seen}
-
- shared_names = set(rownames).intersection(set(colnames))
- dup_names = get_duplicates(rownames) | get_duplicates(colnames) | shared_names
-
- rownames_mapper = {
- f"row_{i}": name for i, name in enumerate(rownames) if name in dup_names
- }
- unique_rownames = [
- f"row_{i}" if name in dup_names else name for i, name in enumerate(rownames)
- ]
-
- colnames_mapper = {
- f"col_{i}": name for i, name in enumerate(colnames) if name in dup_names
- }
- unique_colnames = [
- f"col_{i}" if name in dup_names else name for i, name in enumerate(colnames)
- ]
-
- return rownames_mapper, unique_rownames, colnames_mapper, unique_colnames
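
The removed ``_build_names_mapper`` resolved name collisions between crosstab's row and column groupers by pivoting under positional dummy names and renaming afterwards. A small standalone sketch of that remapping, covering only the shared-name case (the ``row_{i}``/``col_{i}`` scheme mirrors the removed helper):

    rownames = ["key", "other"]
    colnames = ["key"]                     # "key" collides with a row name

    dup_names = set(rownames) & set(colnames)
    unique_rownames = [f"row_{i}" if n in dup_names else n for i, n in enumerate(rownames)]
    unique_colnames = [f"col_{i}" if n in dup_names else n for i, n in enumerate(colnames)]
    rownames_mapper = {f"row_{i}": n for i, n in enumerate(rownames) if n in dup_names}
    colnames_mapper = {f"col_{i}": n for i, n in enumerate(colnames) if n in dup_names}

    print(unique_rownames, unique_colnames)   # ['row_0', 'other'] ['col_0']
    print(rownames_mapper, colnames_mapper)   # {'row_0': 'key'} {'col_0': 'key'}
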
diff --git a/pandas/core/series.py b/pandas/core/series.py
index b20cf8eed9a2e..d493ac0a8c051 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -84,13 +84,7 @@
from pandas.core.generic import NDFrame
from pandas.core.indexers import deprecate_ndim_indexing, unpack_1tuple
from pandas.core.indexes.accessors import CombinedDatetimelikeProperties
-from pandas.core.indexes.api import (
- CategoricalIndex,
- Float64Index,
- Index,
- MultiIndex,
- ensure_index,
-)
+from pandas.core.indexes.api import Float64Index, Index, MultiIndex, ensure_index
import pandas.core.indexes.base as ibase
from pandas.core.indexes.datetimes import DatetimeIndex
from pandas.core.indexes.period import PeriodIndex
@@ -111,21 +105,21 @@
__all__ = ["Series"]
-_shared_doc_kwargs = {
- "axes": "index",
- "klass": "Series",
- "axes_single_arg": "{0 or 'index'}",
- "axis": """axis : {0 or 'index'}
+_shared_doc_kwargs = dict(
+ axes="index",
+ klass="Series",
+ axes_single_arg="{0 or 'index'}",
+ axis="""axis : {0 or 'index'}
Parameter needed for compatibility with DataFrame.""",
- "inplace": """inplace : boolean, default False
+ inplace="""inplace : boolean, default False
If True, performs operation inplace and returns None.""",
- "unique": "np.ndarray",
- "duplicated": "Series",
- "optional_by": "",
- "optional_mapper": "",
- "optional_labels": "",
- "optional_axis": "",
-}
+ unique="np.ndarray",
+ duplicated="Series",
+ optional_by="",
+ optional_mapper="",
+ optional_labels="",
+ optional_axis="",
+)
def _coerce_method(converter):
@@ -418,13 +412,7 @@ def _set_axis(self, axis: int, labels, fastpath: bool = False) -> None:
labels = ensure_index(labels)
if labels._is_all_dates:
- deep_labels = labels
- if isinstance(labels, CategoricalIndex):
- deep_labels = labels.categories
-
- if not isinstance(
- deep_labels, (DatetimeIndex, PeriodIndex, TimedeltaIndex)
- ):
+ if not isinstance(labels, (DatetimeIndex, PeriodIndex, TimedeltaIndex)):
try:
labels = DatetimeIndex(labels)
# need to set here because we changed the index
@@ -916,8 +904,7 @@ def _get_values(self, indexer):
except ValueError:
# mpl compat if we look up e.g. ser[:, np.newaxis];
# see tests.series.timeseries.test_mpl_compat_hack
- # the asarray is needed to avoid returning a 2D DatetimeArray
- return np.asarray(self._values[indexer])
+ return self._values[indexer]
def _get_value(self, label, takeable: bool = False):
"""
@@ -1114,7 +1101,7 @@ def repeat(self, repeats, axis=None) -> "Series":
2 c
dtype: object
"""
- nv.validate_repeat(tuple(), {"axis": axis})
+ nv.validate_repeat(tuple(), dict(axis=axis))
new_index = self.index.repeat(repeats)
new_values = self._values.repeat(repeats)
return self._constructor(new_values, index=new_index).__finalize__(
@@ -4719,7 +4706,6 @@ def _convert_dtypes(
convert_string: bool = True,
convert_integer: bool = True,
convert_boolean: bool = True,
- convert_floating: bool = True,
) -> "Series":
input_series = self
if infer_objects:
@@ -4727,13 +4713,9 @@ def _convert_dtypes(
if is_object_dtype(input_series):
input_series = input_series.copy()
- if convert_string or convert_integer or convert_boolean or convert_floating:
+ if convert_string or convert_integer or convert_boolean:
inferred_dtype = convert_dtypes(
- input_series._values,
- convert_string,
- convert_integer,
- convert_boolean,
- convert_floating,
+ input_series._values, convert_string, convert_integer, convert_boolean
)
try:
result = input_series.astype(inferred_dtype)
diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py
index 3aeb3b664b27f..9de9d1f434a12 100644
--- a/pandas/core/shared_docs.py
+++ b/pandas/core/shared_docs.py
@@ -1,6 +1,6 @@
from typing import Dict
-_shared_docs: Dict[str, str] = {}
+_shared_docs: Dict[str, str] = dict()
_shared_docs[
"aggregate"
diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py
index 0a1cbc6de1cda..729f517c789a7 100644
--- a/pandas/core/sorting.py
+++ b/pandas/core/sorting.py
@@ -31,6 +31,7 @@
if TYPE_CHECKING:
from pandas import MultiIndex
+ from pandas.core.arrays import ExtensionArray
from pandas.core.indexes.base import Index
_INT64_MAX = np.iinfo(np.int64).max
@@ -390,7 +391,7 @@ def nargsort(
return indexer
-def nargminmax(values, method: str):
+def nargminmax(values: "ExtensionArray", method: str) -> int:
"""
Implementation of np.argmin/argmax but for ExtensionArray and which
handles missing values.
@@ -405,16 +406,20 @@ def nargminmax(values, method: str):
int
"""
assert method in {"argmax", "argmin"}
- func = np.argmax if method == "argmax" else np.argmin
- mask = np.asarray(isna(values))
- values = values._values_for_argsort()
+ mask = np.asarray(values.isna())
+ if mask.all():
+ # Use same exception message we would get from numpy
+ raise ValueError(f"attempt to get {method} of an empty sequence")
- idx = np.arange(len(values))
- non_nans = values[~mask]
- non_nan_idx = idx[~mask]
+ if method == "argmax":
+ # Use argsort with ascending=False so that if more than one entry
+        # achieves the maximum, we take the first such occurrence.
+ sorters = values.argsort(ascending=False)
+ else:
+ sorters = values.argsort(ascending=True)
- return non_nan_idx[func(non_nans)]
+ return sorters[0]
def _ensure_key_mapped_multiindex(
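
The sorting.py hunk above replaces the mask-and-take implementation of nargminmax with an argsort-based one. For reference, a minimal NumPy sketch of the behaviour the removed branch implemented; masked_argminmax is an illustrative name, not a pandas function:

import numpy as np

def masked_argminmax(values: np.ndarray, mask: np.ndarray, method: str) -> int:
    # Positional argmin/argmax that skips masked (missing) entries and
    # raises on all-missing input, mirroring the nargminmax contract.
    assert method in {"argmax", "argmin"}
    if mask.all():
        raise ValueError(f"attempt to get {method} of an empty sequence")
    func = np.argmax if method == "argmax" else np.argmin
    idx = np.arange(len(values))
    return idx[~mask][func(values[~mask])]

vals = np.array([3.0, np.nan, 7.0, 1.0])
print(masked_argminmax(vals, np.isnan(vals), "argmax"))  # 2
print(masked_argminmax(vals, np.isnan(vals), "argmin"))  # 3
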
diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py
index 2713b76189157..9d16beba669ca 100644
--- a/pandas/core/strings/accessor.py
+++ b/pandas/core/strings/accessor.py
@@ -26,7 +26,7 @@
from pandas.core.base import NoNewAttributesMixin
-_shared_docs: Dict[str, str] = {}
+_shared_docs: Dict[str, str] = dict()
_cpython_optimized_encoders = (
"utf-8",
"utf8",
@@ -1446,17 +1446,17 @@ def pad(self, width, side="left", fillchar=" "):
filled : Series/Index of objects.
"""
- @Appender(_shared_docs["str_pad"] % {"side": "left and right", "method": "center"})
+ @Appender(_shared_docs["str_pad"] % dict(side="left and right", method="center"))
@forbid_nonstring_types(["bytes"])
def center(self, width, fillchar=" "):
return self.pad(width, side="both", fillchar=fillchar)
- @Appender(_shared_docs["str_pad"] % {"side": "right", "method": "ljust"})
+ @Appender(_shared_docs["str_pad"] % dict(side="right", method="ljust"))
@forbid_nonstring_types(["bytes"])
def ljust(self, width, fillchar=" "):
return self.pad(width, side="right", fillchar=fillchar)
- @Appender(_shared_docs["str_pad"] % {"side": "left", "method": "rjust"})
+ @Appender(_shared_docs["str_pad"] % dict(side="left", method="rjust"))
@forbid_nonstring_types(["bytes"])
def rjust(self, width, fillchar=" "):
return self.pad(width, side="left", fillchar=fillchar)
@@ -1790,11 +1790,9 @@ def encode(self, encoding, errors="strict"):
@Appender(
_shared_docs["str_strip"]
- % {
- "side": "left and right sides",
- "method": "strip",
- "position": "leading and trailing",
- }
+ % dict(
+ side="left and right sides", method="strip", position="leading and trailing"
+ )
)
@forbid_nonstring_types(["bytes"])
def strip(self, to_strip=None):
@@ -1803,7 +1801,7 @@ def strip(self, to_strip=None):
@Appender(
_shared_docs["str_strip"]
- % {"side": "left side", "method": "lstrip", "position": "leading"}
+ % dict(side="left side", method="lstrip", position="leading")
)
@forbid_nonstring_types(["bytes"])
def lstrip(self, to_strip=None):
@@ -1812,7 +1810,7 @@ def lstrip(self, to_strip=None):
@Appender(
_shared_docs["str_strip"]
- % {"side": "right side", "method": "rstrip", "position": "trailing"}
+ % dict(side="right side", method="rstrip", position="trailing")
)
@forbid_nonstring_types(["bytes"])
def rstrip(self, to_strip=None):
@@ -2414,11 +2412,11 @@ def extractall(self, pat, flags=0):
@Appender(
_shared_docs["find"]
- % {
- "side": "lowest",
- "method": "find",
- "also": "rfind : Return highest indexes in each strings.",
- }
+ % dict(
+ side="lowest",
+ method="find",
+ also="rfind : Return highest indexes in each strings.",
+ )
)
@forbid_nonstring_types(["bytes"])
def find(self, sub, start=0, end=None):
@@ -2431,11 +2429,11 @@ def find(self, sub, start=0, end=None):
@Appender(
_shared_docs["find"]
- % {
- "side": "highest",
- "method": "rfind",
- "also": "find : Return lowest indexes in each strings.",
- }
+ % dict(
+ side="highest",
+ method="rfind",
+ also="find : Return lowest indexes in each strings.",
+ )
)
@forbid_nonstring_types(["bytes"])
def rfind(self, sub, start=0, end=None):
@@ -2497,12 +2495,12 @@ def normalize(self, form):
@Appender(
_shared_docs["index"]
- % {
- "side": "lowest",
- "similar": "find",
- "method": "index",
- "also": "rindex : Return highest indexes in each strings.",
- }
+ % dict(
+ side="lowest",
+ similar="find",
+ method="index",
+ also="rindex : Return highest indexes in each strings.",
+ )
)
@forbid_nonstring_types(["bytes"])
def index(self, sub, start=0, end=None):
@@ -2515,12 +2513,12 @@ def index(self, sub, start=0, end=None):
@Appender(
_shared_docs["index"]
- % {
- "side": "highest",
- "similar": "rfind",
- "method": "rindex",
- "also": "index : Return lowest indexes in each strings.",
- }
+ % dict(
+ side="highest",
+ similar="rfind",
+ method="rindex",
+ also="index : Return lowest indexes in each strings.",
+ )
)
@forbid_nonstring_types(["bytes"])
def rindex(self, sub, start=0, end=None):
@@ -2655,24 +2653,18 @@ def len(self):
# isalpha, isnumeric isalnum isdigit isdecimal isspace islower isupper istitle
# _doc_args holds dict of strings to use in substituting casemethod docs
_doc_args: Dict[str, Dict[str, str]] = {}
- _doc_args["lower"] = {"type": "lowercase", "method": "lower", "version": ""}
- _doc_args["upper"] = {"type": "uppercase", "method": "upper", "version": ""}
- _doc_args["title"] = {"type": "titlecase", "method": "title", "version": ""}
- _doc_args["capitalize"] = {
- "type": "be capitalized",
- "method": "capitalize",
- "version": "",
- }
- _doc_args["swapcase"] = {
- "type": "be swapcased",
- "method": "swapcase",
- "version": "",
- }
- _doc_args["casefold"] = {
- "type": "be casefolded",
- "method": "casefold",
- "version": "\n .. versionadded:: 0.25.0\n",
- }
+ _doc_args["lower"] = dict(type="lowercase", method="lower", version="")
+ _doc_args["upper"] = dict(type="uppercase", method="upper", version="")
+ _doc_args["title"] = dict(type="titlecase", method="title", version="")
+ _doc_args["capitalize"] = dict(
+ type="be capitalized", method="capitalize", version=""
+ )
+ _doc_args["swapcase"] = dict(type="be swapcased", method="swapcase", version="")
+ _doc_args["casefold"] = dict(
+ type="be casefolded",
+ method="casefold",
+ version="\n .. versionadded:: 0.25.0\n",
+ )
@Appender(_shared_docs["casemethods"] % _doc_args["lower"])
@forbid_nonstring_types(["bytes"])
@@ -2852,15 +2844,15 @@ def casefold(self):
3 False
dtype: bool
"""
- _doc_args["isalnum"] = {"type": "alphanumeric", "method": "isalnum"}
- _doc_args["isalpha"] = {"type": "alphabetic", "method": "isalpha"}
- _doc_args["isdigit"] = {"type": "digits", "method": "isdigit"}
- _doc_args["isspace"] = {"type": "whitespace", "method": "isspace"}
- _doc_args["islower"] = {"type": "lowercase", "method": "islower"}
- _doc_args["isupper"] = {"type": "uppercase", "method": "isupper"}
- _doc_args["istitle"] = {"type": "titlecase", "method": "istitle"}
- _doc_args["isnumeric"] = {"type": "numeric", "method": "isnumeric"}
- _doc_args["isdecimal"] = {"type": "decimal", "method": "isdecimal"}
+ _doc_args["isalnum"] = dict(type="alphanumeric", method="isalnum")
+ _doc_args["isalpha"] = dict(type="alphabetic", method="isalpha")
+ _doc_args["isdigit"] = dict(type="digits", method="isdigit")
+ _doc_args["isspace"] = dict(type="whitespace", method="isspace")
+ _doc_args["islower"] = dict(type="lowercase", method="islower")
+ _doc_args["isupper"] = dict(type="uppercase", method="isupper")
+ _doc_args["istitle"] = dict(type="titlecase", method="istitle")
+ _doc_args["isnumeric"] = dict(type="numeric", method="isnumeric")
+ _doc_args["isdecimal"] = dict(type="decimal", method="isdecimal")
# force _noarg_wrapper return type with dtype=np.dtype(bool) (GH 29624)
isalnum = _map_and_wrap(
diff --git a/pandas/core/util/numba_.py b/pandas/core/util/numba_.py
index ed920c174ea69..1dd005c1602a5 100644
--- a/pandas/core/util/numba_.py
+++ b/pandas/core/util/numba_.py
@@ -9,7 +9,7 @@
from pandas.errors import NumbaUtilError
GLOBAL_USE_NUMBA: bool = False
-NUMBA_FUNC_CACHE: Dict[Tuple[Callable, str], Callable] = {}
+NUMBA_FUNC_CACHE: Dict[Tuple[Callable, str], Callable] = dict()
def maybe_use_numba(engine: Optional[str]) -> bool:
diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py
index e6185f8ae0679..51a1e2102c273 100644
--- a/pandas/core/window/rolling.py
+++ b/pandas/core/window/rolling.py
@@ -50,6 +50,7 @@
from pandas.core.aggregation import aggregate
from pandas.core.base import DataError, SelectionMixin
+import pandas.core.common as com
from pandas.core.construction import extract_array
from pandas.core.groupby.base import GotItemMixin, ShallowMixin
from pandas.core.indexes.api import Index, MultiIndex
@@ -790,29 +791,22 @@ def _apply(
# Our result will have still kept the column in the result
result = result.drop(columns=column_keys, errors="ignore")
- codes = self._groupby.grouper.codes
- levels = self._groupby.grouper.levels
-
- group_indices = self._groupby.grouper.indices.values()
- if group_indices:
- indexer = np.concatenate(list(group_indices))
- else:
- indexer = np.array([], dtype=np.intp)
- codes = [c.take(indexer) for c in codes]
-
- # if the index of the original dataframe needs to be preserved, append
- # this index (but reordered) to the codes/levels from the groupby
- if grouped_object_index is not None:
- idx = grouped_object_index.take(indexer)
- if not isinstance(idx, MultiIndex):
- idx = MultiIndex.from_arrays([idx])
- codes.extend(list(idx.codes))
- levels.extend(list(idx.levels))
-
- result_index = MultiIndex(
- levels, codes, names=result_index_names, verify_integrity=False
+ result_index_data = []
+ for key, values in self._groupby.grouper.indices.items():
+ for value in values:
+ data = [
+ *com.maybe_make_list(key),
+ *com.maybe_make_list(
+ grouped_object_index[value]
+ if grouped_object_index is not None
+ else []
+ ),
+ ]
+ result_index_data.append(tuple(data))
+
+ result_index = MultiIndex.from_tuples(
+ result_index_data, names=result_index_names
)
-
result.index = result_index
return result
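
The rolling groupby hunk above rebuilds the result index from (group key, original label) tuples via MultiIndex.from_tuples instead of assembling codes and levels by hand. A minimal sketch of that construction, with a made-up indices mapping and index:

import pandas as pd

indices = {"a": [0, 2], "b": [1]}        # group key -> positional indices
original_index = pd.Index([10, 11, 12])  # index of the grouped object

tuples = [
    (key, original_index[pos])
    for key, positions in indices.items()
    for pos in positions
]
mi = pd.MultiIndex.from_tuples(tuples, names=["key", None])
print(mi)  # MultiIndex([('a', 10), ('a', 12), ('b', 11)], names=['key', None])
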
diff --git a/pandas/io/common.py b/pandas/io/common.py
index 9fede5180e727..8ec0a869c7042 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -7,6 +7,7 @@
from io import BufferedIOBase, BytesIO, RawIOBase, TextIOWrapper
import mmap
import os
+import pathlib
from typing import IO, Any, AnyStr, Dict, List, Mapping, Optional, Tuple, cast
from urllib.parse import (
urljoin,
@@ -175,8 +176,19 @@ def stringify_path(
Any other object is passed through unchanged, which includes bytes,
strings, buffers, or anything else that's not even path-like.
"""
- if isinstance(filepath_or_buffer, os.PathLike):
- filepath_or_buffer = filepath_or_buffer.__fspath__()
+ if hasattr(filepath_or_buffer, "__fspath__"):
+ # https://github.com/python/mypy/issues/1424
+ # error: Item "str" of "Union[str, Path, IO[str]]" has no attribute
+ # "__fspath__" [union-attr]
+ # error: Item "IO[str]" of "Union[str, Path, IO[str]]" has no attribute
+ # "__fspath__" [union-attr]
+ # error: Item "str" of "Union[str, Path, IO[bytes]]" has no attribute
+ # "__fspath__" [union-attr]
+ # error: Item "IO[bytes]" of "Union[str, Path, IO[bytes]]" has no
+ # attribute "__fspath__" [union-attr]
+ filepath_or_buffer = filepath_or_buffer.__fspath__() # type: ignore[union-attr]
+ elif isinstance(filepath_or_buffer, pathlib.Path):
+ filepath_or_buffer = str(filepath_or_buffer)
return _expand_user(filepath_or_buffer)
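
The stringify_path hunk above switches from an os.PathLike isinstance check to duck typing on __fspath__, keeping a pathlib.Path fallback. A standalone sketch of that dispatch; to_path_string is an illustrative name, not the pandas function:

import pathlib

def to_path_string(obj):
    # Prefer __fspath__ when available, fall back to pathlib.Path,
    # and pass anything else (strings, buffers) through unchanged.
    if hasattr(obj, "__fspath__"):
        return obj.__fspath__()
    if isinstance(obj, pathlib.Path):
        return str(obj)
    return obj

print(to_path_string(pathlib.Path("/tmp") / "data.csv"))  # '/tmp/data.csv' on POSIX
print(to_path_string("already-a-string.csv"))             # unchanged
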
diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
index 626c3df196380..c519baa4c21da 100644
--- a/pandas/io/excel/_base.py
+++ b/pandas/io/excel/_base.py
@@ -1,17 +1,14 @@
import abc
import datetime
-import inspect
from io import BufferedIOBase, BytesIO, RawIOBase
import os
from textwrap import fill
from typing import Any, Dict, Mapping, Union, cast
-import warnings
from pandas._config import config
from pandas._libs.parsers import STR_NA_VALUES
from pandas._typing import Buffer, FilePathOrBuffer, StorageOptions
-from pandas.compat._optional import import_optional_dependency
from pandas.errors import EmptyDataError
from pandas.util._decorators import Appender, deprecate_nonkeyword_arguments
@@ -102,32 +99,12 @@
of dtype conversion.
engine : str, default None
If io is not a buffer or path, this must be set to identify io.
- Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb".
+ Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb", default "xlrd".
Engine compatibility :
-
- "xlrd" supports most old/new Excel file formats.
- "openpyxl" supports newer Excel file formats.
- "odf" supports OpenDocument file formats (.odf, .ods, .odt).
- "pyxlsb" supports Binary Excel files.
-
- .. versionchanged:: 1.2.0
- The engine `xlrd `_
- is no longer maintained, and is not supported with
- python >= 3.9. When ``engine=None``, the following logic will be
- used to determine the engine.
-
- - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt),
- then `odf `_ will be used.
- - Otherwise if ``path_or_buffer`` is a bytes stream, the file has the
- extension ``.xls``, or is an ``xlrd`` Book instance, then ``xlrd`` will
- be used.
- - Otherwise if `openpyxl `_ is installed,
- then ``openpyxl`` will be used.
- - Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised.
-
- Specifying ``engine="xlrd"`` will continue to be allowed for the
- indefinite future.
-
converters : dict, default None
Dict of functions for converting values in certain columns. Keys can
either be integers or column labels, values are functions that take one
@@ -576,9 +553,6 @@ class ExcelWriter(metaclass=abc.ABCMeta):
Default is to use xlwt for xls, openpyxl for xlsx, odf for ods.
See DataFrame.to_excel for typical usage.
- The writer should be used as a context manager. Otherwise, call `close()` to save
- and close any opened file handles.
-
Parameters
----------
path : str or typing.BinaryIO
@@ -903,32 +877,13 @@ class ExcelFile:
.xls, .xlsx, .xlsb, .xlsm, .odf, .ods, or .odt file.
engine : str, default None
If io is not a buffer or path, this must be set to identify io.
- Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb``
+ Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb``,
+ default ``xlrd``.
Engine compatibility :
-
- ``xlrd`` supports most old/new Excel file formats.
- ``openpyxl`` supports newer Excel file formats.
- ``odf`` supports OpenDocument file formats (.odf, .ods, .odt).
- ``pyxlsb`` supports Binary Excel files.
-
- .. versionchanged:: 1.2.0
-
- The engine `xlrd `_
- is no longer maintained, and is not supported with
- python >= 3.9. When ``engine=None``, the following logic will be
- used to determine the engine.
-
- - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt),
- then `odf `_ will be used.
- - Otherwise if ``path_or_buffer`` is a bytes stream, the file has the
- extension ``.xls``, or is an ``xlrd`` Book instance, then ``xlrd``
- will be used.
- - Otherwise if `openpyxl `_ is installed,
- then ``openpyxl`` will be used.
- - Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised.
-
- Specifying ``engine="xlrd"`` will continue to be allowed for the
- indefinite future.
"""
from pandas.io.excel._odfreader import ODFReader
@@ -947,59 +902,14 @@ def __init__(
self, path_or_buffer, engine=None, storage_options: StorageOptions = None
):
if engine is None:
- # Determine ext and use odf for ods stream/file
+ engine = "xlrd"
if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)):
- ext = None
if _is_ods_stream(path_or_buffer):
engine = "odf"
else:
ext = os.path.splitext(str(path_or_buffer))[-1]
if ext == ".ods":
engine = "odf"
-
- if (
- import_optional_dependency(
- "xlrd", raise_on_missing=False, on_version="ignore"
- )
- is not None
- ):
- from xlrd import Book
-
- if isinstance(path_or_buffer, Book):
- engine = "xlrd"
-
- # GH 35029 - Prefer openpyxl except for xls files
- if engine is None:
- if ext is None or isinstance(path_or_buffer, bytes) or ext == ".xls":
- engine = "xlrd"
- elif (
- import_optional_dependency(
- "openpyxl", raise_on_missing=False, on_version="ignore"
- )
- is not None
- ):
- engine = "openpyxl"
- else:
- caller = inspect.stack()[1]
- if (
- caller.filename.endswith("pandas/io/excel/_base.py")
- and caller.function == "read_excel"
- ):
- stacklevel = 4
- else:
- stacklevel = 2
- warnings.warn(
- "The xlrd engine is no longer maintained and is not "
- "supported when using pandas with python >= 3.9. However, "
- "the engine xlrd will continue to be allowed for the "
- "indefinite future. Beginning with pandas 1.2.0, the "
- "openpyxl engine will be used if it is installed and the "
- "engine argument is not specified. Either install openpyxl "
- "or specify engine='xlrd' to silence this warning.",
- FutureWarning,
- stacklevel=stacklevel,
- )
- engine = "xlrd"
if engine not in self._engines:
raise ValueError(f"Unknown engine: {engine}")
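
With the engine auto-detection above reverted to a plain default of "xlrd", callers can still pin a reader explicitly. A small round-trip sketch, assuming openpyxl is installed and that writing a temporary file in the working directory is acceptable:

import pandas as pd

pd.DataFrame({"a": [1, 2]}).to_excel("tmp_example.xlsx", index=False, engine="openpyxl")
df = pd.read_excel("tmp_example.xlsx", engine="openpyxl")
print(df)
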
diff --git a/pandas/io/excel/_xlwt.py b/pandas/io/excel/_xlwt.py
index 9a725c15de61e..9ede7cd0c2b95 100644
--- a/pandas/io/excel/_xlwt.py
+++ b/pandas/io/excel/_xlwt.py
@@ -45,9 +45,7 @@ def save(self):
"""
Save workbook to disk.
"""
- if self.sheets:
- # fails when the ExcelWriter is just opened and then closed
- self.book.save(self.handles.handle)
+ self.book.save(self.handles.handle)
def write_cells(
self, cells, sheet_name=None, startrow=0, startcol=0, freeze_panes=None
diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py
index 6d14d6172aa6c..fbda78a1842ca 100644
--- a/pandas/io/formats/csvs.py
+++ b/pandas/io/formats/csvs.py
@@ -159,13 +159,13 @@ def _initialize_chunksize(self, chunksize: Optional[int]) -> int:
@property
def _number_format(self) -> Dict[str, Any]:
"""Dictionary used for storing number formatting settings."""
- return {
- "na_rep": self.na_rep,
- "float_format": self.float_format,
- "date_format": self.date_format,
- "quoting": self.quoting,
- "decimal": self.decimal,
- }
+ return dict(
+ na_rep=self.na_rep,
+ float_format=self.float_format,
+ date_format=self.date_format,
+ quoting=self.quoting,
+ decimal=self.decimal,
+ )
@property
def data_index(self) -> Index:
diff --git a/pandas/io/formats/latex.py b/pandas/io/formats/latex.py
index f6f3571955e6e..0212fd6f695cb 100644
--- a/pandas/io/formats/latex.py
+++ b/pandas/io/formats/latex.py
@@ -153,11 +153,11 @@ def pad_empties(x):
break
return [x[0]] + [i if i else " " * len(pad) for i in x[1:]]
- gen = (pad_empties(i) for i in out)
+ out = (pad_empties(i) for i in out)
# Add empty spaces for each column level
clevels = self.frame.columns.nlevels
- out = [[" " * len(i[-1])] * clevels + i for i in gen]
+ out = [[" " * len(i[-1])] * clevels + i for i in out]
# Add the column names to the last index column
cnames = self.frame.columns.names
diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py
index 128e50d84657c..ac453839792f3 100644
--- a/pandas/io/formats/printing.py
+++ b/pandas/io/formats/printing.py
@@ -206,7 +206,7 @@ def as_escaped_string(
translate = escape_chars
escape_chars = list(escape_chars.keys())
else:
- escape_chars = escape_chars or ()
+ escape_chars = escape_chars or tuple()
result = str(thing)
for c in escape_chars:
diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py
index 4557c10927a15..0eeff44d0f74c 100644
--- a/pandas/io/formats/style.py
+++ b/pandas/io/formats/style.py
@@ -433,16 +433,16 @@ def format_attr(pair):
else:
table_attr += ' class="tex2jax_ignore"'
- return {
- "head": head,
- "cellstyle": cellstyle,
- "body": body,
- "uuid": uuid,
- "precision": precision,
- "table_styles": table_styles,
- "caption": caption,
- "table_attributes": table_attr,
- }
+ return dict(
+ head=head,
+ cellstyle=cellstyle,
+ body=body,
+ uuid=uuid,
+ precision=precision,
+ table_styles=table_styles,
+ caption=caption,
+ table_attributes=table_attr,
+ )
def format(self, formatter, subset=None, na_rep: Optional[str] = None) -> "Styler":
"""
diff --git a/pandas/io/html.py b/pandas/io/html.py
index 4a2d4af62f3e9..334a3dab6c13a 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -794,8 +794,9 @@ def _data_to_frame(**kwargs):
# fill out elements of body that are "ragged"
_expand_elements(body)
- with TextParser(body, header=header, **kwargs) as tp:
- return tp.read()
+ tp = TextParser(body, header=header, **kwargs)
+ df = tp.read()
+ return df
_valid_parsers = {
diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
index da085d0d0eb2f..e1feb1aa3fada 100644
--- a/pandas/io/json/_json.py
+++ b/pandas/io/json/_json.py
@@ -437,10 +437,6 @@ def read_json(
This can only be passed if `lines=True`.
If this is None, the file will be read into memory all at once.
- .. versionchanged:: 1.2
-
- ``JsonReader`` is a context manager.
-
compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer'
For on-the-fly decompression of on-disk data. If 'infer', then use
gzip, bz2, zip or xz if path_or_buf is a string ending in
@@ -559,8 +555,7 @@ def read_json(
if chunksize:
return json_reader
- with json_reader:
- return json_reader.read()
+ return json_reader.read()
class JsonReader(abc.Iterator):
@@ -752,12 +747,6 @@ def __next__(self):
self.close()
raise StopIteration
- def __enter__(self):
- return self
-
- def __exit__(self, exc_type, exc_value, traceback):
- self.close()
-
class Parser:
_split_keys: Tuple[str, ...]
@@ -1109,7 +1098,7 @@ def _process_converter(self, f, filt=None):
assert obj is not None # for mypy
needs_new_obj = False
- new_obj = {}
+ new_obj = dict()
for i, (col, c) in enumerate(obj.items()):
if filt(col, c):
new_data, result = f(col, c)
diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index 8b1184df92eaf..a19b132a7891d 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -1,6 +1,5 @@
""" parquet compat """
-from distutils.version import LooseVersion
import io
import os
from typing import Any, AnyStr, Dict, List, Optional, Tuple
@@ -178,39 +177,10 @@ def write(
handles.close()
def read(
- self,
- path,
- columns=None,
- use_nullable_dtypes=False,
- storage_options: StorageOptions = None,
- **kwargs,
+ self, path, columns=None, storage_options: StorageOptions = None, **kwargs
):
kwargs["use_pandas_metadata"] = True
- to_pandas_kwargs = {}
- if use_nullable_dtypes:
- if LooseVersion(self.api.__version__) >= "0.16":
- import pandas as pd
-
- mapping = {
- self.api.int8(): pd.Int8Dtype(),
- self.api.int16(): pd.Int16Dtype(),
- self.api.int32(): pd.Int32Dtype(),
- self.api.int64(): pd.Int64Dtype(),
- self.api.uint8(): pd.UInt8Dtype(),
- self.api.uint16(): pd.UInt16Dtype(),
- self.api.uint32(): pd.UInt32Dtype(),
- self.api.uint64(): pd.UInt64Dtype(),
- self.api.bool_(): pd.BooleanDtype(),
- self.api.string(): pd.StringDtype(),
- }
- to_pandas_kwargs["types_mapper"] = mapping.get
- else:
- raise ValueError(
- "'use_nullable_dtypes=True' is only supported for pyarrow >= 0.16 "
- f"({self.api.__version__} is installed"
- )
-
path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
path,
kwargs.pop("filesystem", None),
@@ -220,7 +190,7 @@ def read(
try:
return self.api.parquet.read_table(
path_or_handle, columns=columns, **kwargs
- ).to_pandas(**to_pandas_kwargs)
+ ).to_pandas()
finally:
if handles is not None:
handles.close()
@@ -288,12 +258,6 @@ def write(
def read(
self, path, columns=None, storage_options: StorageOptions = None, **kwargs
):
- use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False)
- if use_nullable_dtypes:
- raise ValueError(
- "The 'use_nullable_dtypes' argument is not supported for the "
- "fastparquet engine"
- )
path = stringify_path(path)
parquet_kwargs = {}
handles = None
@@ -404,13 +368,7 @@ def to_parquet(
return None
-def read_parquet(
- path,
- engine: str = "auto",
- columns=None,
- use_nullable_dtypes: bool = False,
- **kwargs,
-):
+def read_parquet(path, engine: str = "auto", columns=None, **kwargs):
"""
Load a parquet object from the file path, returning a DataFrame.
@@ -439,15 +397,6 @@ def read_parquet(
'pyarrow' is unavailable.
columns : list, default=None
If not None, only these columns will be read from the file.
- use_nullable_dtypes : bool, default False
- If True, use dtypes that use ``pd.NA`` as missing value indicator
- for the resulting DataFrame (only applicable for ``engine="pyarrow"``).
- As new dtypes are added that support ``pd.NA`` in the future, the
- output with this option will change to use those dtypes.
- Note: this is an experimental option, and behaviour (e.g. additional
- support dtypes) may change without notice.
-
- .. versionadded:: 1.2.0
**kwargs
Any additional kwargs are passed to the engine.
@@ -456,6 +405,4 @@ def read_parquet(
DataFrame
"""
impl = get_engine(engine)
- return impl.read(
- path, columns=columns, use_nullable_dtypes=use_nullable_dtypes, **kwargs
- )
+ return impl.read(path, columns=columns, **kwargs)
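
The pyarrow branch removed above built a types_mapper so that use_nullable_dtypes could return pd.NA-backed dtypes. Roughly, the idea looks like this; an illustration only, assuming pyarrow >= 0.16 is installed:

import pandas as pd
import pyarrow as pa

mapping = {
    pa.int64(): pd.Int64Dtype(),
    pa.bool_(): pd.BooleanDtype(),
    pa.string(): pd.StringDtype(),
}

table = pa.table({"a": [1, None, 3], "b": [True, None, False]})
df = table.to_pandas(types_mapper=mapping.get)
print(df.dtypes)  # a -> Int64, b -> boolean
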
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 5b623c360c3ef..25e8d9acf4690 100644
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -276,19 +276,11 @@
iterator : bool, default False
Return TextFileReader object for iteration or getting chunks with
``get_chunk()``.
-
- .. versionchanged:: 1.2
-
- ``TextFileReader`` is a context manager.
chunksize : int, optional
Return TextFileReader object for iteration.
See the `IO Tools docs
`_
for more information on ``iterator`` and ``chunksize``.
-
- .. versionchanged:: 1.2
-
- ``TextFileReader`` is a context manager.
compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer'
For on-the-fly decompression of on-disk data. If 'infer' and
`filepath_or_buffer` is path-like, then detect compression from the
@@ -459,8 +451,12 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds):
if chunksize or iterator:
return parser
- with parser:
- return parser.read(nrows)
+ try:
+ data = parser.read(nrows)
+ finally:
+ parser.close()
+
+ return data
_parser_defaults = {
@@ -1078,12 +1074,6 @@ def get_chunk(self, size=None):
size = min(size, self.nrows - self._currow)
return self.read(nrows=size)
- def __enter__(self):
- return self
-
- def __exit__(self, exc_type, exc_value, traceback):
- self.close()
-
def _is_index_col(col):
return col is not None and col is not False
@@ -1891,11 +1881,7 @@ def __init__(self, src: FilePathOrBuffer, **kwds):
# no attribute "mmap" [union-attr]
self.handles.handle = self.handles.handle.mmap # type: ignore[union-attr]
- try:
- self._reader = parsers.TextReader(self.handles.handle, **kwds)
- except Exception:
- self.handles.close()
- raise
+ self._reader = parsers.TextReader(self.handles.handle, **kwds)
self.unnamed_cols = self._reader.unnamed_cols
passed_names = self.names is None
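
The _read hunk in the parsers.py diff above swaps the context-manager form for an explicit try/finally around read and close. The equivalent caller-side pattern on a TextFileReader looks like this; a minimal sketch using an in-memory CSV:

import io
import pandas as pd

buf = io.StringIO("a,b\n1,2\n3,4\n5,6\n")
reader = pd.read_csv(buf, iterator=True)
try:
    data = reader.read(2)  # read the first two rows
finally:
    reader.close()
print(data)
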
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index 3fe251d300856..d7ee4acc2e670 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -2037,7 +2037,7 @@ def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str):
val_kind = _ensure_decoded(self.kind)
values = _maybe_convert(values, val_kind, encoding, errors)
- kwargs = {}
+ kwargs = dict()
kwargs["name"] = _ensure_decoded(self.index_name)
if self.freq is not None:
@@ -3237,7 +3237,7 @@ def __init__(
self.non_index_axes = non_index_axes or []
self.values_axes = values_axes or []
self.data_columns = data_columns or []
- self.info = info or {}
+ self.info = info or dict()
self.nan_rep = nan_rep
@property
@@ -3446,7 +3446,7 @@ def get_attrs(self):
""" retrieve our attributes """
self.non_index_axes = getattr(self.attrs, "non_index_axes", None) or []
self.data_columns = getattr(self.attrs, "data_columns", None) or []
- self.info = getattr(self.attrs, "info", None) or {}
+ self.info = getattr(self.attrs, "info", None) or dict()
self.nan_rep = getattr(self.attrs, "nan_rep", None)
self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None))
self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict"))
@@ -3596,7 +3596,7 @@ def create_index(self, columns=None, optlevel=None, kind: Optional[str] = None):
if not isinstance(columns, (tuple, list)):
columns = [columns]
- kw = {}
+ kw = dict()
if optlevel is not None:
kw["optlevel"] = optlevel
if kind is not None:
@@ -3689,7 +3689,7 @@ def validate_data_columns(self, data_columns, min_itemsize, non_index_axes):
return []
axis, axis_labels = non_index_axes[0]
- info = self.info.get(axis, {})
+ info = self.info.get(axis, dict())
if info.get("type") == "MultiIndex" and data_columns:
raise ValueError(
f"cannot use a multi-index on axis [{axis}] with "
@@ -4071,7 +4071,7 @@ def create_description(
if expectedrows is None:
expectedrows = max(self.nrows_expected, 10000)
- d = {"name": "table", "expectedrows": expectedrows}
+ d = dict(name="table", expectedrows=expectedrows)
# description from the axes & values
d["description"] = {a.cname: a.typ for a in self.axes}
@@ -4458,9 +4458,9 @@ def read(
result = self._read_axes(where=where, start=start, stop=stop)
info = (
- self.info.get(self.non_index_axes[0][0], {})
+ self.info.get(self.non_index_axes[0][0], dict())
if len(self.non_index_axes)
- else {}
+ else dict()
)
inds = [i for i, ax in enumerate(self.axes) if ax is self.index_axes[0]]
diff --git a/pandas/io/sas/sasreader.py b/pandas/io/sas/sasreader.py
index 243218129fda6..3f0370209e9a8 100644
--- a/pandas/io/sas/sasreader.py
+++ b/pandas/io/sas/sasreader.py
@@ -26,12 +26,6 @@ def read(self, nrows=None):
def close(self):
pass
- def __enter__(self):
- return self
-
- def __exit__(self, exc_type, exc_value, traceback):
- self.close()
-
@overload
def read_sas(
@@ -91,17 +85,9 @@ def read_sas(
Encoding for text data. If None, text data are stored as raw bytes.
chunksize : int
Read file `chunksize` lines at a time, returns iterator.
-
- .. versionchanged:: 1.2
-
- ``TextFileReader`` is a context manager.
iterator : bool, defaults to False
If True, returns an iterator for reading the file incrementally.
- .. versionchanged:: 1.2
-
- ``TextFileReader`` is a context manager.
-
Returns
-------
DataFrame if iterator=False and chunksize=None, else SAS7BDATReader
@@ -148,5 +134,4 @@ def read_sas(
if iterator or chunksize:
return reader
- with reader:
- return reader.read()
+ return reader.read()
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index 6f296d3c8d92f..d97ba6183c955 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -1464,7 +1464,7 @@ def _read_value_labels(self) -> None:
off = off[ii]
val = val[ii]
txt = self.path_or_buf.read(txtlen)
- self.value_label_dict[labname] = {}
+ self.value_label_dict[labname] = dict()
for i in range(n):
end = off[i + 1] if i < n - 1 else txtlen
self.value_label_dict[labname][val[i]] = self._decode(txt[off[i] : end])
diff --git a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py
index ae4fff7b495d0..64cd43c230f28 100644
--- a/pandas/plotting/_matplotlib/timeseries.py
+++ b/pandas/plotting/_matplotlib/timeseries.py
@@ -1,7 +1,7 @@
# TODO: Use the fact that axis can have units to simplify the process
import functools
-from typing import TYPE_CHECKING, Optional, cast
+from typing import TYPE_CHECKING, Optional
import numpy as np
@@ -26,7 +26,7 @@
if TYPE_CHECKING:
from matplotlib.axes import Axes
- from pandas import DatetimeIndex, Index, Series
+ from pandas import Index, Series
# ---------------------------------------------------------------------
# Plotting functions and monkey patches
@@ -243,7 +243,6 @@ def maybe_convert_index(ax: "Axes", data):
if freq is None:
# We only get here for DatetimeIndex
- data.index = cast("DatetimeIndex", data.index)
freq = data.index.inferred_freq
freq = to_offset(freq)
diff --git a/pandas/tests/arithmetic/conftest.py b/pandas/tests/arithmetic/conftest.py
index f507c6d4f45fb..149389b936def 100644
--- a/pandas/tests/arithmetic/conftest.py
+++ b/pandas/tests/arithmetic/conftest.py
@@ -18,6 +18,18 @@ def id_func(x):
# ------------------------------------------------------------------
+@pytest.fixture(
+ params=[
+ ("foo", None, None),
+ ("Egon", "Venkman", None),
+ ("NCC1701D", "NCC1701D", "NCC1701D"),
+ ]
+)
+def names(request):
+ """
+ A 3-tuple of names, the first two for operands, the last for a result.
+ """
+ return request.param
@pytest.fixture(params=[1, np.array(1, dtype=np.int64)])
diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py
index 092a3f0d4402f..0202337a4389a 100644
--- a/pandas/tests/arithmetic/test_timedelta64.py
+++ b/pandas/tests/arithmetic/test_timedelta64.py
@@ -822,7 +822,7 @@ def test_operators_timedelta64(self):
tm.assert_series_equal(rs, xp)
assert rs.dtype == "timedelta64[ns]"
- df = DataFrame({"A": v1})
+ df = DataFrame(dict(A=v1))
td = Series([timedelta(days=i) for i in range(3)])
assert td.dtype == "timedelta64[ns]"
diff --git a/pandas/tests/arrays/boolean/test_arithmetic.py b/pandas/tests/arrays/boolean/test_arithmetic.py
index 01de64568a011..1a4ab9799e8e5 100644
--- a/pandas/tests/arrays/boolean/test_arithmetic.py
+++ b/pandas/tests/arrays/boolean/test_arithmetic.py
@@ -5,7 +5,6 @@
import pandas as pd
import pandas._testing as tm
-from pandas.arrays import FloatingArray
@pytest.fixture
@@ -52,15 +51,13 @@ def test_sub(left_array, right_array):
def test_div(left_array, right_array):
+ # for now division gives a float numpy array
result = left_array / right_array
- expected = FloatingArray(
- np.array(
- [1.0, np.inf, np.nan, 0.0, np.nan, np.nan, np.nan, np.nan, np.nan],
- dtype="float64",
- ),
- np.array([False, False, True, False, False, True, True, True, True]),
+ expected = np.array(
+ [1.0, np.inf, np.nan, 0.0, np.nan, np.nan, np.nan, np.nan, np.nan],
+ dtype="float64",
)
- tm.assert_extension_array_equal(result, expected)
+ tm.assert_numpy_array_equal(result, expected)
@pytest.mark.parametrize(
diff --git a/pandas/tests/arrays/boolean/test_function.py b/pandas/tests/arrays/boolean/test_function.py
index 0f8743489b412..7665c350e3443 100644
--- a/pandas/tests/arrays/boolean/test_function.py
+++ b/pandas/tests/arrays/boolean/test_function.py
@@ -85,13 +85,6 @@ def test_value_counts_na():
tm.assert_series_equal(result, expected)
-def test_value_counts_with_normalize():
- s = pd.Series([True, False, pd.NA], dtype="boolean")
- result = s.value_counts(normalize=True)
- expected = pd.Series([1, 1], index=[False, True], dtype="Float64") / 2
- tm.assert_series_equal(result, expected)
-
-
def test_diff():
a = pd.array(
[True, True, False, False, True, None, True, None, False], dtype="boolean"
diff --git a/pandas/tests/arrays/categorical/test_missing.py b/pandas/tests/arrays/categorical/test_missing.py
index 36ed790eff63c..cb0ba128c1fb7 100644
--- a/pandas/tests/arrays/categorical/test_missing.py
+++ b/pandas/tests/arrays/categorical/test_missing.py
@@ -62,13 +62,13 @@ def test_set_item_nan(self):
"fillna_kwargs, msg",
[
(
- {"value": 1, "method": "ffill"},
+ dict(value=1, method="ffill"),
"Cannot specify both 'value' and 'method'.",
),
- ({}, "Must specify a fill 'value' or 'method'."),
- ({"method": "bad"}, "Invalid fill method. Expecting .* bad"),
+ (dict(), "Must specify a fill 'value' or 'method'."),
+ (dict(method="bad"), "Invalid fill method. Expecting .* bad"),
(
- {"value": Series([1, 2, 3, 4, "a"])},
+ dict(value=Series([1, 2, 3, 4, "a"])),
"Cannot setitem on a Categorical with a new category",
),
],
diff --git a/pandas/tests/arrays/floating/test_function.py b/pandas/tests/arrays/floating/test_function.py
index ef95eac316397..baf60a363ad29 100644
--- a/pandas/tests/arrays/floating/test_function.py
+++ b/pandas/tests/arrays/floating/test_function.py
@@ -113,13 +113,6 @@ def test_value_counts_empty():
tm.assert_series_equal(result, expected)
-def test_value_counts_with_normalize():
- s = pd.Series([0.1, 0.2, 0.1, pd.NA], dtype="Float64")
- result = s.value_counts(normalize=True)
- expected = pd.Series([2, 1], index=[0.1, 0.2], dtype="Float64") / 3
- tm.assert_series_equal(result, expected)
-
-
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize("min_count", [0, 4])
def test_floating_array_sum(skipna, min_count, dtype):
diff --git a/pandas/tests/arrays/integer/test_arithmetic.py b/pandas/tests/arrays/integer/test_arithmetic.py
index 4b8d95ae83e4f..cf382dd5e37e0 100644
--- a/pandas/tests/arrays/integer/test_arithmetic.py
+++ b/pandas/tests/arrays/integer/test_arithmetic.py
@@ -3,11 +3,9 @@
import numpy as np
import pytest
-from pandas.compat.numpy import _np_version_under1p20
-
import pandas as pd
import pandas._testing as tm
-from pandas.core.arrays import FloatingArray, integer_array
+from pandas.core.arrays import integer_array
import pandas.core.ops as ops
# Basic test for the arithmetic array ops
@@ -45,12 +43,13 @@ def test_sub(dtype):
def test_div(dtype):
+ # for now division gives a float numpy array
a = pd.array([1, 2, 3, None, 5], dtype=dtype)
b = pd.array([0, 1, None, 3, 4], dtype=dtype)
result = a / b
- expected = pd.array([np.inf, 2, None, None, 1.25], dtype="Float64")
- tm.assert_extension_array_equal(result, expected)
+ expected = np.array([np.inf, 2, np.nan, np.nan, 1.25], dtype="float64")
+ tm.assert_numpy_array_equal(result, expected)
@pytest.mark.parametrize("zero, negative", [(0, False), (0.0, False), (-0.0, True)])
@@ -58,13 +57,10 @@ def test_divide_by_zero(zero, negative):
# https://github.com/pandas-dev/pandas/issues/27398
a = pd.array([0, 1, -1, None], dtype="Int64")
result = a / zero
- expected = FloatingArray(
- np.array([np.nan, np.inf, -np.inf, 1], dtype="float64"),
- np.array([False, False, False, True]),
- )
+ expected = np.array([np.nan, np.inf, -np.inf, np.nan])
if negative:
expected *= -1
- tm.assert_extension_array_equal(result, expected)
+ tm.assert_numpy_array_equal(result, expected)
def test_floordiv(dtype):
@@ -101,11 +97,8 @@ def test_pow_scalar():
tm.assert_extension_array_equal(result, expected)
result = a ** np.nan
- expected = FloatingArray(
- np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype="float64"),
- np.array([False, False, False, True, False]),
- )
- tm.assert_extension_array_equal(result, expected)
+ expected = np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype="float64")
+ tm.assert_numpy_array_equal(result, expected)
# reversed
a = a[1:] # Can't raise integers to negative powers.
@@ -123,11 +116,8 @@ def test_pow_scalar():
tm.assert_extension_array_equal(result, expected)
result = np.nan ** a
- expected = FloatingArray(
- np.array([1, np.nan, np.nan, np.nan], dtype="float64"),
- np.array([False, False, True, False]),
- )
- tm.assert_extension_array_equal(result, expected)
+ expected = np.array([1, np.nan, np.nan, np.nan], dtype="float64")
+ tm.assert_numpy_array_equal(result, expected)
def test_pow_array():
@@ -141,10 +131,10 @@ def test_pow_array():
def test_rpow_one_to_na():
# https://github.com/pandas-dev/pandas/issues/22022
# https://github.com/pandas-dev/pandas/issues/29997
- arr = pd.array([np.nan, np.nan], dtype="Int64")
+ arr = integer_array([np.nan, np.nan])
result = np.array([1.0, 2.0]) ** arr
- expected = pd.array([1.0, np.nan], dtype="Float64")
- tm.assert_extension_array_equal(result, expected)
+ expected = np.array([1.0, np.nan])
+ tm.assert_numpy_array_equal(result, expected)
@pytest.mark.parametrize("other", [0, 0.5])
@@ -206,19 +196,9 @@ def test_arith_coerce_scalar(data, all_arithmetic_operators):
result = op(s, other)
expected = op(s.astype(float), other)
- expected = expected.astype("Float64")
# rfloordiv results in nan instead of inf
- if all_arithmetic_operators == "__rfloordiv__" and _np_version_under1p20:
- # for numpy 1.20 https://github.com/numpy/numpy/pull/16161
- # updated floordiv, now matches our behavior defined in core.ops
- mask = (
- ((expected == np.inf) | (expected == -np.inf)).fillna(False).to_numpy(bool)
- )
- expected.array._data[mask] = np.nan
- # rmod results in NaN that wasn't NA in original nullable Series -> unmask it
- elif all_arithmetic_operators == "__rmod__":
- mask = (s == 0).fillna(False).to_numpy(bool)
- expected.array._mask[mask] = False
+ if all_arithmetic_operators == "__rfloordiv__":
+ expected[(expected == np.inf) | (expected == -np.inf)] = np.nan
tm.assert_series_equal(result, expected)
@@ -231,7 +211,7 @@ def test_arithmetic_conversion(all_arithmetic_operators, other):
s = pd.Series([1, 2, 3], dtype="Int64")
result = op(s, other)
- assert result.dtype == "Float64"
+ assert result.dtype is np.dtype("float")
def test_cross_type_arithmetic():
diff --git a/pandas/tests/arrays/integer/test_function.py b/pandas/tests/arrays/integer/test_function.py
index 521547cc7357d..9cdea1c71f109 100644
--- a/pandas/tests/arrays/integer/test_function.py
+++ b/pandas/tests/arrays/integer/test_function.py
@@ -127,14 +127,6 @@ def test_value_counts_empty():
tm.assert_series_equal(result, expected)
-def test_value_counts_with_normalize():
- # GH 33172
- s = pd.Series([1, 2, 1, pd.NA], dtype="Int64")
- result = s.value_counts(normalize=True)
- expected = pd.Series([2, 1], index=[1, 2], dtype="Float64") / 3
- tm.assert_series_equal(result, expected)
-
-
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize("min_count", [0, 4])
def test_integer_array_sum(skipna, min_count, any_nullable_int_dtype):
diff --git a/pandas/tests/arrays/masked/test_arithmetic.py b/pandas/tests/arrays/masked/test_arithmetic.py
index 1d2833c5da276..6de10fd896878 100644
--- a/pandas/tests/arrays/masked/test_arithmetic.py
+++ b/pandas/tests/arrays/masked/test_arithmetic.py
@@ -43,7 +43,11 @@ def test_array_scalar_like_equivalence(data, all_arithmetic_operators):
for scalar in [scalar, data.dtype.type(scalar)]:
result = op(data, scalar)
expected = op(data, scalar_array)
- tm.assert_extension_array_equal(result, expected)
+ if isinstance(expected, ExtensionArray):
+ tm.assert_extension_array_equal(result, expected)
+ else:
+ # TODO div still gives float ndarray -> remove this once we have Float EA
+ tm.assert_numpy_array_equal(result, expected)
def test_array_NA(data, all_arithmetic_operators):
diff --git a/pandas/tests/arrays/sparse/test_arithmetics.py b/pandas/tests/arrays/sparse/test_arithmetics.py
index 61f4e3e50d09d..c9f1dd7f589fc 100644
--- a/pandas/tests/arrays/sparse/test_arithmetics.py
+++ b/pandas/tests/arrays/sparse/test_arithmetics.py
@@ -3,8 +3,6 @@
import numpy as np
import pytest
-from pandas.compat.numpy import _np_version_under1p20
-
import pandas as pd
import pandas._testing as tm
from pandas.core import ops
@@ -118,15 +116,9 @@ def _check_logical_ops(self, a, b, a_dense, b_dense):
@pytest.mark.parametrize("scalar", [0, 1, 3])
@pytest.mark.parametrize("fill_value", [None, 0, 2])
def test_float_scalar(
- self, kind, mix, all_arithmetic_functions, fill_value, scalar, request
+ self, kind, mix, all_arithmetic_functions, fill_value, scalar
):
op = all_arithmetic_functions
-
- if not _np_version_under1p20:
- if op in [operator.floordiv, ops.rfloordiv]:
- mark = pytest.mark.xfail(strict=False, reason="GH#38172")
- request.node.add_marker(mark)
-
values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
a = self._klass(values, kind=kind, fill_value=fill_value)
@@ -150,11 +142,15 @@ def test_float_scalar_comparison(self, kind):
self._check_comparison_ops(a, 0, values, 0)
self._check_comparison_ops(a, 3, values, 3)
- def test_float_same_index_without_nans(
- self, kind, mix, all_arithmetic_functions, request
- ):
+ def test_float_same_index(self, kind, mix, all_arithmetic_functions):
# when sp_index are the same
op = all_arithmetic_functions
+ values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
+ rvalues = self._base([np.nan, 2, 3, 4, np.nan, 0, 1, 3, 2, np.nan])
+
+ a = self._klass(values, kind=kind)
+ b = self._klass(rvalues, kind=kind)
+ self._check_numeric_ops(a, b, values, rvalues, mix, op)
values = self._base([0.0, 1.0, 2.0, 6.0, 0.0, 0.0, 1.0, 2.0, 1.0, 0.0])
rvalues = self._base([0.0, 2.0, 3.0, 4.0, 0.0, 0.0, 1.0, 3.0, 2.0, 0.0])
@@ -163,24 +159,6 @@ def test_float_same_index_without_nans(
b = self._klass(rvalues, kind=kind, fill_value=0)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
- def test_float_same_index_with_nans(
- self, kind, mix, all_arithmetic_functions, request
- ):
- # when sp_index are the same
- op = all_arithmetic_functions
-
- if not _np_version_under1p20:
- if op in [operator.floordiv, ops.rfloordiv]:
- mark = pytest.mark.xfail(strict=False, reason="GH#38172")
- request.node.add_marker(mark)
-
- values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
- rvalues = self._base([np.nan, 2, 3, 4, np.nan, 0, 1, 3, 2, np.nan])
-
- a = self._klass(values, kind=kind)
- b = self._klass(rvalues, kind=kind)
- self._check_numeric_ops(a, b, values, rvalues, mix, op)
-
def test_float_same_index_comparison(self, kind):
# when sp_index are the same
values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
@@ -346,14 +324,9 @@ def test_bool_array_logical(self, kind, fill_value):
b = self._klass(rvalues, kind=kind, dtype=np.bool_, fill_value=fill_value)
self._check_logical_ops(a, b, values, rvalues)
- def test_mixed_array_float_int(self, kind, mix, all_arithmetic_functions, request):
+ def test_mixed_array_float_int(self, kind, mix, all_arithmetic_functions):
op = all_arithmetic_functions
- if not _np_version_under1p20:
- if op in [operator.floordiv, ops.rfloordiv] and mix:
- mark = pytest.mark.xfail(strict=True, reason="GH#38172")
- request.node.add_marker(mark)
-
rdtype = "int64"
values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
diff --git a/pandas/tests/arrays/sparse/test_libsparse.py b/pandas/tests/arrays/sparse/test_libsparse.py
index 992dff218415d..517dc4a2c3d8b 100644
--- a/pandas/tests/arrays/sparse/test_libsparse.py
+++ b/pandas/tests/arrays/sparse/test_libsparse.py
@@ -12,47 +12,42 @@
TEST_LENGTH = 20
-plain_case = {
- "xloc": [0, 7, 15],
- "xlen": [3, 5, 5],
- "yloc": [2, 9, 14],
- "ylen": [2, 3, 5],
- "intersect_loc": [2, 9, 15],
- "intersect_len": [1, 3, 4],
-}
-delete_blocks = {
- "xloc": [0, 5],
- "xlen": [4, 4],
- "yloc": [1],
- "ylen": [4],
- "intersect_loc": [1],
- "intersect_len": [3],
-}
-split_blocks = {
- "xloc": [0],
- "xlen": [10],
- "yloc": [0, 5],
- "ylen": [3, 7],
- "intersect_loc": [0, 5],
- "intersect_len": [3, 5],
-}
-skip_block = {
- "xloc": [10],
- "xlen": [5],
- "yloc": [0, 12],
- "ylen": [5, 3],
- "intersect_loc": [12],
- "intersect_len": [3],
-}
-
-no_intersect = {
- "xloc": [0, 10],
- "xlen": [4, 6],
- "yloc": [5, 17],
- "ylen": [4, 2],
- "intersect_loc": [],
- "intersect_len": [],
-}
+plain_case = dict(
+ xloc=[0, 7, 15],
+ xlen=[3, 5, 5],
+ yloc=[2, 9, 14],
+ ylen=[2, 3, 5],
+ intersect_loc=[2, 9, 15],
+ intersect_len=[1, 3, 4],
+)
+delete_blocks = dict(
+ xloc=[0, 5], xlen=[4, 4], yloc=[1], ylen=[4], intersect_loc=[1], intersect_len=[3]
+)
+split_blocks = dict(
+ xloc=[0],
+ xlen=[10],
+ yloc=[0, 5],
+ ylen=[3, 7],
+ intersect_loc=[0, 5],
+ intersect_len=[3, 5],
+)
+skip_block = dict(
+ xloc=[10],
+ xlen=[5],
+ yloc=[0, 12],
+ ylen=[5, 3],
+ intersect_loc=[12],
+ intersect_len=[3],
+)
+
+no_intersect = dict(
+ xloc=[0, 10],
+ xlen=[4, 6],
+ yloc=[5, 17],
+ ylen=[4, 2],
+ intersect_loc=[],
+ intersect_len=[],
+)
def check_cases(_check_case):
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index c70d55b07661d..9a1634380aaba 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -366,15 +366,6 @@ def test_astype_int(dtype, request):
tm.assert_extension_array_equal(result, expected)
-def test_astype_float(any_float_allowed_nullable_dtype):
- # Don't compare arrays (37974)
- ser = pd.Series(["1.1", pd.NA, "3.3"], dtype="string")
-
- result = ser.astype(any_float_allowed_nullable_dtype)
- expected = pd.Series([1.1, np.nan, 3.3], dtype=any_float_allowed_nullable_dtype)
- tm.assert_series_equal(result, expected)
-
-
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.xfail(reason="Not implemented StringArray.sum")
def test_reduce(skipna, dtype):
@@ -495,18 +486,6 @@ def test_value_counts_na(dtype, request):
tm.assert_series_equal(result, expected)
-def test_value_counts_with_normalize(dtype, request):
- if dtype == "arrow_string":
- reason = "TypeError: boolean value of NA is ambiguous"
- mark = pytest.mark.xfail(reason=reason)
- request.node.add_marker(mark)
-
- s = pd.Series(["a", "b", "a", pd.NA], dtype=dtype)
- result = s.value_counts(normalize=True)
- expected = pd.Series([2, 1], index=["a", "b"], dtype="Float64") / 3
- tm.assert_series_equal(result, expected)
-
-
@pytest.mark.parametrize(
"values, expected",
[
diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py
index c489aa5867632..159f52a4c7c25 100644
--- a/pandas/tests/arrays/test_datetimelike.py
+++ b/pandas/tests/arrays/test_datetimelike.py
@@ -3,6 +3,7 @@
import numpy as np
import pytest
+import pytz
from pandas._libs import NaT, OutOfBoundsDatetime, Timestamp
from pandas.compat.numpy import np_version_under1p18
@@ -268,16 +269,18 @@ def test_searchsorted(self):
assert result == 10
@pytest.mark.parametrize("box", [None, "index", "series"])
- def test_searchsorted_castable_strings(self, arr1d, box, request):
+ def test_searchsorted_castable_strings(self, arr1d, box):
if isinstance(arr1d, DatetimeArray):
tz = arr1d.tz
- ts1, ts2 = arr1d[1:3]
- if tz is not None and ts1.tz.tzname(ts1) != ts2.tz.tzname(ts2):
+ if (
+ tz is not None
+ and tz is not pytz.UTC
+ and not isinstance(tz, pytz._FixedOffset)
+ ):
# If we have e.g. tzutc(), when we cast to string and parse
# back we get pytz.UTC, and then consider them different timezones
# so incorrectly raise.
- mark = pytest.mark.xfail(reason="timezone comparisons inconsistent")
- request.node.add_marker(mark)
+ pytest.xfail(reason="timezone comparisons inconsistent")
arr = arr1d
if box is None:
@@ -388,17 +391,19 @@ def test_setitem(self):
expected[:2] = expected[-2:]
tm.assert_numpy_array_equal(arr.asi8, expected)
- def test_setitem_strs(self, arr1d, request):
+ def test_setitem_strs(self, arr1d):
# Check that we parse strs in both scalar and listlike
if isinstance(arr1d, DatetimeArray):
tz = arr1d.tz
- ts1, ts2 = arr1d[-2:]
- if tz is not None and ts1.tz.tzname(ts1) != ts2.tz.tzname(ts2):
+ if (
+ tz is not None
+ and tz is not pytz.UTC
+ and not isinstance(tz, pytz._FixedOffset)
+ ):
# If we have e.g. tzutc(), when we cast to string and parse
# back we get pytz.UTC, and then consider them different timezones
# so incorrectly raise.
- mark = pytest.mark.xfail(reason="timezone comparisons inconsistent")
- request.node.add_marker(mark)
+ pytest.xfail(reason="timezone comparisons inconsistent")
# Setting list-like of strs
expected = arr1d.copy()
diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py
index cc4aed5e4413d..a6fdb82e48197 100644
--- a/pandas/tests/base/test_conversion.py
+++ b/pandas/tests/base/test_conversion.py
@@ -316,35 +316,18 @@ def test_array_multiindex_raises():
TimedeltaArray(np.array([0, 3600000000000], dtype="i8"), freq="H"),
np.array([0, 3600000000000], dtype="m8[ns]"),
),
- # GH#26406 tz is preserved in Categorical[dt64tz]
- (
- pd.Categorical(pd.date_range("2016-01-01", periods=2, tz="US/Pacific")),
- np.array(
- [
- Timestamp("2016-01-01", tz="US/Pacific"),
- Timestamp("2016-01-02", tz="US/Pacific"),
- ]
- ),
- ),
],
)
-def test_to_numpy(array, expected, index_or_series_or_array, request):
- box = index_or_series_or_array
+def test_to_numpy(array, expected, index_or_series):
+ box = index_or_series
thing = box(array)
if array.dtype.name in ("Int64", "Sparse[int64, 0]") and box is pd.Index:
pytest.skip(f"No index type for {array.dtype}")
- if array.dtype.name == "int64" and box is pd.array:
- mark = pytest.mark.xfail(reason="thing is Int64 and to_numpy() returns object")
- request.node.add_marker(mark)
-
result = thing.to_numpy()
tm.assert_numpy_array_equal(result, expected)
- result = np.asarray(thing)
- tm.assert_numpy_array_equal(result, expected)
-
@pytest.mark.parametrize("as_series", [True, False])
@pytest.mark.parametrize(
diff --git a/pandas/tests/dtypes/cast/test_convert_objects.py b/pandas/tests/dtypes/cast/test_convert_objects.py
new file mode 100644
index 0000000000000..a28d554acd312
--- /dev/null
+++ b/pandas/tests/dtypes/cast/test_convert_objects.py
@@ -0,0 +1,12 @@
+import numpy as np
+import pytest
+
+from pandas.core.dtypes.cast import maybe_convert_objects
+
+
+@pytest.mark.parametrize("data", [[1, 2], ["apply", "banana"]])
+def test_maybe_convert_objects_copy(data):
+ arr = np.array(data)
+ out = maybe_convert_objects(arr)
+
+ assert arr is not out
diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py
index ce6737db44195..2db9a9a403e1c 100644
--- a/pandas/tests/dtypes/test_common.py
+++ b/pandas/tests/dtypes/test_common.py
@@ -105,16 +105,16 @@ def test_period_dtype(self, dtype):
assert com.pandas_dtype(dtype) == dtype
-dtypes = {
- "datetime_tz": com.pandas_dtype("datetime64[ns, US/Eastern]"),
- "datetime": com.pandas_dtype("datetime64[ns]"),
- "timedelta": com.pandas_dtype("timedelta64[ns]"),
- "period": PeriodDtype("D"),
- "integer": np.dtype(np.int64),
- "float": np.dtype(np.float64),
- "object": np.dtype(object),
- "category": com.pandas_dtype("category"),
-}
+dtypes = dict(
+ datetime_tz=com.pandas_dtype("datetime64[ns, US/Eastern]"),
+ datetime=com.pandas_dtype("datetime64[ns]"),
+ timedelta=com.pandas_dtype("timedelta64[ns]"),
+ period=PeriodDtype("D"),
+ integer=np.dtype(np.int64),
+ float=np.dtype(np.float64),
+ object=np.dtype(object),
+ category=com.pandas_dtype("category"),
+)
@pytest.mark.parametrize("name1,dtype1", list(dtypes.items()), ids=lambda x: str(x))
diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py
index 872dd03768833..a419cb0dded79 100644
--- a/pandas/tests/dtypes/test_dtypes.py
+++ b/pandas/tests/dtypes/test_dtypes.py
@@ -199,14 +199,6 @@ def test_not_string(self):
# though CategoricalDtype has object kind, it cannot be string
assert not is_string_dtype(CategoricalDtype())
- def test_repr_range_categories(self):
- rng = pd.Index(range(3))
- dtype = CategoricalDtype(categories=rng, ordered=False)
- result = repr(dtype)
-
- expected = "CategoricalDtype(categories=range(0, 3), ordered=False)"
- assert result == expected
-
class TestDatetimeTZDtype(Base):
@pytest.fixture
diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py
index c9ca5cb34d271..27fac95a16b7a 100644
--- a/pandas/tests/dtypes/test_inference.py
+++ b/pandas/tests/dtypes/test_inference.py
@@ -68,7 +68,7 @@ def coerce(request):
((1,), True, "tuple"),
(tuple(), True, "tuple-empty"),
({"a": 1}, True, "dict"),
- ({}, True, "dict-empty"),
+ (dict(), True, "dict-empty"),
({"a", 1}, "set", "set"),
(set(), "set", "set-empty"),
(frozenset({"a", 1}), "set", "frozenset"),
@@ -1489,7 +1489,7 @@ def test_datetimeindex_from_empty_datetime64_array():
def test_nan_to_nat_conversions():
df = DataFrame(
- {"A": np.asarray(range(10), dtype="float64"), "B": Timestamp("20010101")}
+ dict({"A": np.asarray(range(10), dtype="float64"), "B": Timestamp("20010101")})
)
df.iloc[3:6, :] = np.nan
result = df.loc[4, "B"]
diff --git a/pandas/tests/extension/arrow/test_bool.py b/pandas/tests/extension/arrow/test_bool.py
index 922b3b94c16c1..12426a0c92c55 100644
--- a/pandas/tests/extension/arrow/test_bool.py
+++ b/pandas/tests/extension/arrow/test_bool.py
@@ -50,10 +50,6 @@ def test_view(self, data):
# __setitem__ does not work, so we only have a smoke-test
data.view()
- @pytest.mark.xfail(raises=AssertionError, reason="Not implemented yet")
- def test_contains(self, data, data_missing, nulls_fixture):
- super().test_contains(data, data_missing, nulls_fixture)
-
class TestConstructors(BaseArrowTests, base.BaseConstructorsTests):
def test_from_dtype(self, data):
diff --git a/pandas/tests/extension/base/interface.py b/pandas/tests/extension/base/interface.py
index d7997310dde3d..9ae4b01508d79 100644
--- a/pandas/tests/extension/base/interface.py
+++ b/pandas/tests/extension/base/interface.py
@@ -29,29 +29,6 @@ def test_can_hold_na_valid(self, data):
# GH-20761
assert data._can_hold_na is True
- def test_contains(self, data, data_missing, nulls_fixture):
- # GH-37867
- # Tests for membership checks. Membership checks for nan-likes is tricky and
- # the settled on rule is: `nan_like in arr` is True if nan_like is
- # arr.dtype.na_value and arr.isna().any() is True. Else the check returns False.
-
- na_value = data.dtype.na_value
- # ensure data without missing values
- data = data[~data.isna()]
-
- # first elements are non-missing
- assert data[0] in data
- assert data_missing[0] in data_missing
-
- # check the presence of na_value
- assert na_value in data_missing
- assert na_value not in data
-
- if nulls_fixture is not na_value:
- # the data can never contain other nan-likes than na_value
- assert nulls_fixture not in data
- assert nulls_fixture not in data_missing
-
def test_memory_usage(self, data):
s = pd.Series(data)
result = s.memory_usage(index=False)
diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py
index 1cc03d4f4f2bd..29a59cdefbd83 100644
--- a/pandas/tests/extension/base/methods.py
+++ b/pandas/tests/extension/base/methods.py
@@ -447,10 +447,10 @@ def test_repeat(self, data, repeats, as_series, use_numpy):
@pytest.mark.parametrize(
"repeats, kwargs, error, msg",
[
- (2, {"axis": 1}, ValueError, "axis"),
- (-1, {}, ValueError, "negative"),
- ([1, 2], {}, ValueError, "shape"),
- (2, {"foo": "bar"}, TypeError, "'foo'"),
+ (2, dict(axis=1), ValueError, "axis"),
+ (-1, dict(), ValueError, "negative"),
+ ([1, 2], dict(), ValueError, "shape"),
+ (2, dict(foo="bar"), TypeError, "'foo'"),
],
)
def test_repeat_raises(self, data, repeats, kwargs, error, msg, use_numpy):
diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py
index a713550dafa5c..9ede9c7fbd0fd 100644
--- a/pandas/tests/extension/decimal/array.py
+++ b/pandas/tests/extension/decimal/array.py
@@ -155,14 +155,6 @@ def __setitem__(self, key, value):
def __len__(self) -> int:
return len(self._data)
- def __contains__(self, item) -> bool:
- if not isinstance(item, decimal.Decimal):
- return False
- elif item.is_nan():
- return self.isna().any()
- else:
- return super().__contains__(item)
-
@property
def nbytes(self) -> int:
n = len(self)
diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py
index 3a5e49796c53b..74ca341e27bf8 100644
--- a/pandas/tests/extension/json/test_json.py
+++ b/pandas/tests/extension/json/test_json.py
@@ -143,13 +143,6 @@ def test_custom_asserts(self):
with pytest.raises(AssertionError, match=msg):
self.assert_frame_equal(a.to_frame(), b.to_frame())
- @pytest.mark.xfail(
- reason="comparison method not implemented for JSONArray (GH-37867)"
- )
- def test_contains(self, data):
- # GH-37867
- super().test_contains(data)
-
class TestConstructors(BaseJSON, base.BaseConstructorsTests):
@pytest.mark.skip(reason="not implemented constructor from dtype")
diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py
index ced7ea9261310..8acbeaf0b8170 100644
--- a/pandas/tests/extension/test_boolean.py
+++ b/pandas/tests/extension/test_boolean.py
@@ -130,7 +130,7 @@ def _check_op(self, s, op, other, op_name, exc=NotImplementedError):
elif op_name in ("__truediv__", "__rtruediv__"):
# combine with bools does not generate the correct result
# (numpy behaviour for div is to regard the bools as numeric)
- expected = s.astype(float).combine(other, op).astype("Float64")
+ expected = s.astype(float).combine(other, op)
if op_name == "__rpow__":
# for rpow, combine does not propagate NaN
expected[result.isna()] = np.nan
@@ -235,10 +235,6 @@ def test_searchsorted(self, data_for_sorting, as_series):
def test_value_counts(self, all_data, dropna):
return super().test_value_counts(all_data, dropna)
- @pytest.mark.skip(reason="uses nullable integer")
- def test_value_counts_with_normalize(self, data):
- pass
-
def test_argmin_argmax(self, data_for_sorting, data_missing_for_sorting):
# override because there are only 2 unique values
diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py
index d03a9ab6b2588..95f338cbc3240 100644
--- a/pandas/tests/extension/test_categorical.py
+++ b/pandas/tests/extension/test_categorical.py
@@ -87,28 +87,6 @@ def test_memory_usage(self, data):
# Is this deliberate?
super().test_memory_usage(data)
- def test_contains(self, data, data_missing, nulls_fixture):
- # GH-37867
- # na value handling in Categorical.__contains__ is deprecated.
- # See base.BaseInterFaceTests.test_contains for more details.
-
- na_value = data.dtype.na_value
- # ensure data without missing values
- data = data[~data.isna()]
-
- # first elements are non-missing
- assert data[0] in data
- assert data_missing[0] in data_missing
-
- # check the presence of na_value
- assert na_value in data_missing
- assert na_value not in data
-
- # Categoricals can contain other nan-likes than na_value
- if nulls_fixture is not na_value:
- assert nulls_fixture not in data
- assert nulls_fixture in data_missing # this line differs from super method
-
class TestConstructors(base.BaseConstructorsTests):
pass
diff --git a/pandas/tests/extension/test_floating.py b/pandas/tests/extension/test_floating.py
index c08c31e90fecc..00881178de1b4 100644
--- a/pandas/tests/extension/test_floating.py
+++ b/pandas/tests/extension/test_floating.py
@@ -184,10 +184,6 @@ def test_value_counts(self, all_data, dropna):
self.assert_series_equal(result, expected)
- @pytest.mark.skip(reason="uses nullable integer")
- def test_value_counts_with_normalize(self, data):
- pass
-
class TestCasting(base.BaseCastingTests):
pass
diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py
index b1461dcbd9e53..725533765ca2c 100644
--- a/pandas/tests/extension/test_integer.py
+++ b/pandas/tests/extension/test_integer.py
@@ -130,7 +130,10 @@ def _check_op(self, s, op, other, op_name, exc=NotImplementedError):
expected = s.combine(other, op)
if op_name in ("__rtruediv__", "__truediv__", "__div__"):
- expected = expected.fillna(np.nan).astype("Float64")
+ expected = expected.fillna(np.nan).astype(float)
+ if op_name == "__rtruediv__":
+ # TODO reverse operators result in object dtype
+ result = result.astype(float)
elif op_name.startswith("__r"):
# TODO reverse operators result in object dtype
# see https://github.com/pandas-dev/pandas/issues/22024
@@ -221,10 +224,6 @@ def test_value_counts(self, all_data, dropna):
self.assert_series_equal(result, expected)
- @pytest.mark.skip(reason="uses nullable integer")
- def test_value_counts_with_normalize(self, data):
- pass
-
class TestCasting(base.BaseCastingTests):
pass
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
index d49c4c5cf4889..db1940226e04e 100644
--- a/pandas/tests/extension/test_string.py
+++ b/pandas/tests/extension/test_string.py
@@ -118,10 +118,6 @@ class TestMethods(base.BaseMethodsTests):
def test_value_counts(self, all_data, dropna):
return super().test_value_counts(all_data, dropna)
- @pytest.mark.skip(reason="returns nullable")
- def test_value_counts_with_normalize(self, data):
- pass
-
class TestCasting(base.BaseCastingTests):
pass
diff --git a/pandas/tests/frame/apply/test_frame_apply.py b/pandas/tests/frame/apply/test_frame_apply.py
index 9ec56c3429b22..15952f36b0fae 100644
--- a/pandas/tests/frame/apply/test_frame_apply.py
+++ b/pandas/tests/frame/apply/test_frame_apply.py
@@ -58,12 +58,6 @@ def test_apply(self, float_frame):
assert isinstance(df["c0"].dtype, CategoricalDtype)
assert isinstance(df["c1"].dtype, CategoricalDtype)
- def test_apply_axis1_with_ea(self):
- # GH#36785
- df = DataFrame({"A": [Timestamp("2013-01-01", tz="UTC")]})
- result = df.apply(lambda x: x, axis=1)
- tm.assert_frame_equal(result, df)
-
def test_apply_mixed_datetimelike(self):
# mixed datetimelike
# GH 7778
diff --git a/pandas/tests/frame/common.py b/pandas/tests/frame/common.py
index 95ebaa4641d1b..73e60ff389038 100644
--- a/pandas/tests/frame/common.py
+++ b/pandas/tests/frame/common.py
@@ -5,7 +5,7 @@
def _check_mixed_float(df, dtype=None):
# float16 are most likely to be upcasted to float32
- dtypes = {"A": "float32", "B": "float32", "C": "float16", "D": "float64"}
+ dtypes = dict(A="float32", B="float32", C="float16", D="float64")
if isinstance(dtype, str):
dtypes = {k: dtype for k, v in dtypes.items()}
elif isinstance(dtype, dict):
@@ -21,7 +21,7 @@ def _check_mixed_float(df, dtype=None):
def _check_mixed_int(df, dtype=None):
- dtypes = {"A": "int32", "B": "uint64", "C": "uint8", "D": "int64"}
+ dtypes = dict(A="int32", B="uint64", C="uint8", D="int64")
if isinstance(dtype, str):
dtypes = {k: dtype for k, v in dtypes.items()}
elif isinstance(dtype, dict):
diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py
index 49eb570c4ffe0..e33009f4597f0 100644
--- a/pandas/tests/frame/indexing/test_indexing.py
+++ b/pandas/tests/frame/indexing/test_indexing.py
@@ -241,24 +241,6 @@ def inc(x):
expected = DataFrame([[-1, inc], [inc, -1]])
tm.assert_frame_equal(df, expected)
- @pytest.mark.parametrize(
- "cols, values, expected",
- [
- (["C", "D", "D", "a"], [1, 2, 3, 4], 4), # with duplicates
- (["D", "C", "D", "a"], [1, 2, 3, 4], 4), # mixed order
- (["C", "B", "B", "a"], [1, 2, 3, 4], 4), # other duplicate cols
- (["C", "B", "a"], [1, 2, 3], 3), # no duplicates
- (["B", "C", "a"], [3, 2, 1], 1), # alphabetical order
- (["C", "a", "B"], [3, 2, 1], 2), # in the middle
- ],
- )
- def test_setitem_same_column(self, cols, values, expected):
- # GH 23239
- df = DataFrame([values], columns=cols)
- df["a"] = df["a"]
- result = df["a"].values[0]
- assert result == expected
-
def test_getitem_boolean(
self, float_string_frame, mixed_float_frame, mixed_int_frame, datetime_frame
):
@@ -732,7 +714,7 @@ def test_setitem_empty(self):
tm.assert_frame_equal(result, df)
@pytest.mark.parametrize("dtype", ["float", "int64"])
- @pytest.mark.parametrize("kwargs", [{}, {"index": [1]}, {"columns": ["A"]}])
+ @pytest.mark.parametrize("kwargs", [dict(), dict(index=[1]), dict(columns=["A"])])
def test_setitem_empty_frame_with_boolean(self, dtype, kwargs):
# see gh-10126
kwargs["dtype"] = dtype
@@ -1256,7 +1238,7 @@ def test_single_element_ix_dont_upcast(self, float_frame):
assert is_integer(result)
# GH 11617
- df = DataFrame({"a": [1.23]})
+ df = DataFrame(dict(a=[1.23]))
df["b"] = 666
result = df.loc[0, "b"]
diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py
index 884cb6c20b77e..e4a66ea9133dd 100644
--- a/pandas/tests/frame/indexing/test_setitem.py
+++ b/pandas/tests/frame/indexing/test_setitem.py
@@ -319,24 +319,6 @@ def test_setitem_bool_with_numeric_index(self, dtype):
tm.assert_index_equal(df.columns, expected_cols)
-class TestDataFrameSetItemWithExpansion:
- def test_setitem_listlike_views(self):
- # GH#38148
- df = DataFrame({"a": [1, 2, 3], "b": [4, 4, 6]})
-
- # get one column as a view of df
- ser = df["a"]
-
- # add columns with list-like indexer
- df[["c", "d"]] = np.array([[0.1, 0.2], [0.3, 0.4], [0.4, 0.5]])
-
- # edit in place the first column to check view semantics
- df.iloc[0, 0] = 100
-
- expected = Series([100, 2, 3], name="a")
- tm.assert_series_equal(ser, expected)
-
-
class TestDataFrameSetItemSlicing:
def test_setitem_slice_position(self):
# GH#31469
diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py
index acdb5726e4adb..3495247585236 100644
--- a/pandas/tests/frame/indexing/test_where.py
+++ b/pandas/tests/frame/indexing/test_where.py
@@ -356,11 +356,11 @@ def test_where_datetime(self):
# GH 3311
df = DataFrame(
- {
- "A": date_range("20130102", periods=5),
- "B": date_range("20130104", periods=5),
- "C": np.random.randn(5),
- }
+ dict(
+ A=date_range("20130102", periods=5),
+ B=date_range("20130104", periods=5),
+ C=np.random.randn(5),
+ )
)
stamp = datetime(2013, 1, 3)
@@ -618,7 +618,7 @@ def test_df_where_change_dtype(self):
tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize("kwargs", [{}, {"other": None}])
+ @pytest.mark.parametrize("kwargs", [dict(), dict(other=None)])
def test_df_where_with_category(self, kwargs):
# GH#16979
df = DataFrame(np.arange(2 * 3).reshape(2, 3), columns=list("ABC"))
diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py
index d79969eac0323..f05c90f37ea8a 100644
--- a/pandas/tests/frame/methods/test_astype.py
+++ b/pandas/tests/frame/methods/test_astype.py
@@ -563,7 +563,7 @@ def test_astype_empty_dtype_dict(self):
# issue mentioned further down in the following issue's thread
# https://github.com/pandas-dev/pandas/issues/33113
df = DataFrame()
- result = df.astype({})
+ result = df.astype(dict())
tm.assert_frame_equal(result, df)
assert result is not df
diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py
index 934ad9eb8213a..08c4293323500 100644
--- a/pandas/tests/frame/methods/test_combine_first.py
+++ b/pandas/tests/frame/methods/test_combine_first.py
@@ -103,7 +103,6 @@ def test_combine_first_mixed_bug(self):
combined = frame1.combine_first(frame2)
assert len(combined.columns) == 5
- def test_combine_first_same_as_in_update(self):
# gh 3016 (same as in update)
df = DataFrame(
[[1.0, 2.0, False, True], [4.0, 5.0, True, False]],
@@ -119,7 +118,6 @@ def test_combine_first_same_as_in_update(self):
df.loc[0, "A"] = 45
tm.assert_frame_equal(result, df)
- def test_combine_first_doc_example(self):
# doc example
df1 = DataFrame(
{"A": [1.0, np.nan, 3.0, 5.0, np.nan], "B": [np.nan, 2.0, 3.0, np.nan, 6.0]}
@@ -136,56 +134,38 @@ def test_combine_first_doc_example(self):
expected = DataFrame({"A": [1, 2, 3, 5, 3, 7.0], "B": [np.nan, 2, 3, 4, 6, 8]})
tm.assert_frame_equal(result, expected)
- def test_combine_first_return_obj_type_with_bools(self):
- # GH3552
-
+ # GH3552, return object dtype with bools
df1 = DataFrame(
[[np.nan, 3.0, True], [-4.6, np.nan, True], [np.nan, 7.0, False]]
)
df2 = DataFrame([[-42.6, np.nan, True], [-5.0, 1.6, False]], index=[1, 2])
- expected = Series([True, True, False], name=2, dtype=object)
-
- result_12 = df1.combine_first(df2)[2]
- tm.assert_series_equal(result_12, expected)
-
- result_21 = df2.combine_first(df1)[2]
- tm.assert_series_equal(result_21, expected)
-
- @pytest.mark.parametrize(
- "data1, data2, data_expected",
- (
- (
- [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
- [None, None, None],
- [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
- ),
- (
- [None, None, None],
- [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
- [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
- ),
- (
- [datetime(2000, 1, 2), None, None],
- [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
- [datetime(2000, 1, 2), datetime(2000, 1, 2), datetime(2000, 1, 3)],
- ),
- (
- [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
- [datetime(2000, 1, 2), None, None],
- [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
- ),
- ),
- )
- def test_combine_first_convert_datatime_correctly(
- self, data1, data2, data_expected
- ):
- # GH 3593
+ result = df1.combine_first(df2)[2]
+ expected = Series([True, True, False], name=2)
+ tm.assert_series_equal(result, expected)
- df1, df2 = DataFrame({"a": data1}), DataFrame({"a": data2})
- result = df1.combine_first(df2)
- expected = DataFrame({"a": data_expected})
- tm.assert_frame_equal(result, expected)
+ # GH 3593, converting datetime64[ns] incorrectly
+ df0 = DataFrame(
+ {"a": [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)]}
+ )
+ df1 = DataFrame({"a": [None, None, None]})
+ df2 = df1.combine_first(df0)
+ tm.assert_frame_equal(df2, df0)
+
+ df2 = df0.combine_first(df1)
+ tm.assert_frame_equal(df2, df0)
+
+ df0 = DataFrame(
+ {"a": [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)]}
+ )
+ df1 = DataFrame({"a": [datetime(2000, 1, 2), None, None]})
+ df2 = df1.combine_first(df0)
+ result = df0.copy()
+ result.iloc[0, :] = df1.iloc[0, :]
+ tm.assert_frame_equal(df2, result)
+
+ df2 = df0.combine_first(df1)
+ tm.assert_frame_equal(df2, df0)
def test_combine_first_align_nan(self):
# GH 7509 (not fixed)
@@ -359,14 +339,9 @@ def test_combine_first_int(self):
df1 = DataFrame({"a": [0, 1, 3, 5]}, dtype="int64")
df2 = DataFrame({"a": [1, 4]}, dtype="int64")
- result_12 = df1.combine_first(df2)
- expected_12 = DataFrame({"a": [0, 1, 3, 5]}, dtype="float64")
- tm.assert_frame_equal(result_12, expected_12)
-
- result_21 = df2.combine_first(df1)
- expected_21 = DataFrame({"a": [1, 4, 3, 5]}, dtype="float64")
-
- tm.assert_frame_equal(result_21, expected_21)
+ res = df1.combine_first(df2)
+ tm.assert_frame_equal(res, df1)
+ assert res["a"].dtype == "int64"
@pytest.mark.parametrize("val", [1, 1.0])
def test_combine_first_with_asymmetric_other(self, val):
@@ -392,26 +367,6 @@ def test_combine_first_string_dtype_only_na(self):
tm.assert_frame_equal(result, expected)
-@pytest.mark.parametrize(
- "scalar1, scalar2",
- [
- (datetime(2020, 1, 1), datetime(2020, 1, 2)),
- (pd.Period("2020-01-01", "D"), pd.Period("2020-01-02", "D")),
- (pd.Timedelta("89 days"), pd.Timedelta("60 min")),
- (pd.Interval(left=0, right=1), pd.Interval(left=2, right=3, closed="left")),
- ],
-)
-def test_combine_first_timestamp_bug(scalar1, scalar2, nulls_fixture):
- # GH28481
- na_value = nulls_fixture
- frame = DataFrame([[na_value, na_value]], columns=["a", "b"])
- other = DataFrame([[scalar1, scalar2]], columns=["b", "c"])
-
- result = frame.combine_first(other)
- expected = DataFrame([[na_value, scalar1, scalar2]], columns=["a", "b", "c"])
- tm.assert_frame_equal(result, expected)
-
-
def test_combine_first_with_nan_multiindex():
# gh-36562
diff --git a/pandas/tests/frame/methods/test_convert.py b/pandas/tests/frame/methods/test_convert.py
index a00b2b5960884..50add248f9614 100644
--- a/pandas/tests/frame/methods/test_convert.py
+++ b/pandas/tests/frame/methods/test_convert.py
@@ -43,9 +43,9 @@ def test_convert_objects(self, float_string_frame):
converted["H"].astype("int32")
# mixed in a single column
- df = DataFrame({"s": Series([1, "na", 3, 4])})
+ df = DataFrame(dict(s=Series([1, "na", 3, 4])))
result = df._convert(datetime=True, numeric=True)
- expected = DataFrame({"s": Series([1, np.nan, 3, 4])})
+ expected = DataFrame(dict(s=Series([1, np.nan, 3, 4])))
tm.assert_frame_equal(result, expected)
def test_convert_objects_no_conversion(self):
diff --git a/pandas/tests/frame/methods/test_diff.py b/pandas/tests/frame/methods/test_diff.py
index b8328b43a6b13..8affcce478cf4 100644
--- a/pandas/tests/frame/methods/test_diff.py
+++ b/pandas/tests/frame/methods/test_diff.py
@@ -132,10 +132,10 @@ def test_diff_datetime_axis1(self, tz):
def test_diff_timedelta(self):
# GH#4533
df = DataFrame(
- {
- "time": [Timestamp("20130101 9:01"), Timestamp("20130101 9:02")],
- "value": [1.0, 2.0],
- }
+ dict(
+ time=[Timestamp("20130101 9:01"), Timestamp("20130101 9:02")],
+ value=[1.0, 2.0],
+ )
)
res = df.diff()
diff --git a/pandas/tests/frame/methods/test_drop_duplicates.py b/pandas/tests/frame/methods/test_drop_duplicates.py
index b1d3890540bf9..79b152b677dfd 100644
--- a/pandas/tests/frame/methods/test_drop_duplicates.py
+++ b/pandas/tests/frame/methods/test_drop_duplicates.py
@@ -459,12 +459,3 @@ def test_drop_duplicates_series_vs_dataframe(keep):
dropped_frame = df[[column]].drop_duplicates(keep=keep)
dropped_series = df[column].drop_duplicates(keep=keep)
tm.assert_frame_equal(dropped_frame, dropped_series.to_frame())
-
-
-@pytest.mark.parametrize("arg", [[1], 1, "True", [], 0])
-def test_drop_duplicates_non_boolean_ignore_index(arg):
- # GH#38274
- df = DataFrame({"a": [1, 2, 1, 3]})
- msg = '^For argument "ignore_index" expected type bool, received type .*.$'
- with pytest.raises(ValueError, match=msg):
- df.drop_duplicates(ignore_index=arg)
diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py
index b427611099be3..d59b70fa91a57 100644
--- a/pandas/tests/frame/methods/test_fillna.py
+++ b/pandas/tests/frame/methods/test_fillna.py
@@ -53,10 +53,10 @@ def test_fillna_mixed_float(self, mixed_float_frame):
mf = mixed_float_frame.reindex(columns=["A", "B", "D"])
mf.loc[mf.index[-10:], "A"] = np.nan
result = mf.fillna(value=0)
- _check_mixed_float(result, dtype={"C": None})
+ _check_mixed_float(result, dtype=dict(C=None))
result = mf.fillna(method="pad")
- _check_mixed_float(result, dtype={"C": None})
+ _check_mixed_float(result, dtype=dict(C=None))
def test_fillna_empty(self):
# empty frame (GH#2778)
@@ -262,7 +262,7 @@ def test_fillna_dtype_conversion(self):
tm.assert_frame_equal(result, expected)
# equiv of replace
- df = DataFrame({"A": [1, np.nan], "B": [1.0, 2.0]})
+ df = DataFrame(dict(A=[1, np.nan], B=[1.0, 2.0]))
for v in ["", 1, np.nan, 1.0]:
expected = df.replace(np.nan, v)
result = df.fillna(v)
diff --git a/pandas/tests/frame/methods/test_rename.py b/pandas/tests/frame/methods/test_rename.py
index 1080d97b30987..857dd0ad7268b 100644
--- a/pandas/tests/frame/methods/test_rename.py
+++ b/pandas/tests/frame/methods/test_rename.py
@@ -76,8 +76,8 @@ def test_rename(self, float_frame):
@pytest.mark.parametrize(
"args,kwargs",
[
- ((ChainMap({"A": "a"}, {"B": "b"}),), {"axis": "columns"}),
- ((), {"columns": ChainMap({"A": "a"}, {"B": "b"})}),
+ ((ChainMap({"A": "a"}, {"B": "b"}),), dict(axis="columns")),
+ ((), dict(columns=ChainMap({"A": "a"}, {"B": "b"}))),
],
)
def test_rename_chainmap(self, args, kwargs):
diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py
index ab750bca7e069..8e59dd959ab57 100644
--- a/pandas/tests/frame/methods/test_replace.py
+++ b/pandas/tests/frame/methods/test_replace.py
@@ -1123,7 +1123,7 @@ def test_replace_series_no_regex(self):
tm.assert_series_equal(result, expected)
def test_replace_dict_tuple_list_ordering_remains_the_same(self):
- df = DataFrame({"A": [np.nan, 1]})
+ df = DataFrame(dict(A=[np.nan, 1]))
res1 = df.replace(to_replace={np.nan: 0, 1: -1e8})
res2 = df.replace(to_replace=(1, np.nan), value=[-1e8, 0])
res3 = df.replace(to_replace=[1, np.nan], value=[-1e8, 0])
diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py
index 00d4a4277a42f..5864b547a552b 100644
--- a/pandas/tests/frame/methods/test_reset_index.py
+++ b/pandas/tests/frame/methods/test_reset_index.py
@@ -618,7 +618,7 @@ def test_reset_index_empty_frame_with_datetime64_multiindex():
def test_reset_index_empty_frame_with_datetime64_multiindex_from_groupby():
# https://github.com/pandas-dev/pandas/issues/35657
- df = DataFrame({"c1": [10.0], "c2": ["a"], "c3": pd.to_datetime("2020-01-01")})
+ df = DataFrame(dict(c1=[10.0], c2=["a"], c3=pd.to_datetime("2020-01-01")))
df = df.head(0).groupby(["c2", "c3"])[["c1"]].sum()
result = df.reset_index()
expected = DataFrame(
diff --git a/pandas/tests/frame/methods/test_sort_values.py b/pandas/tests/frame/methods/test_sort_values.py
index b94f54a4819c0..be5f3ee9c8191 100644
--- a/pandas/tests/frame/methods/test_sort_values.py
+++ b/pandas/tests/frame/methods/test_sort_values.py
@@ -305,11 +305,11 @@ def test_sort_values_nat_values_in_int_column(self):
float_values = (2.0, -1.797693e308)
df = DataFrame(
- {"int": int_values, "float": float_values}, columns=["int", "float"]
+ dict(int=int_values, float=float_values), columns=["int", "float"]
)
df_reversed = DataFrame(
- {"int": int_values[::-1], "float": float_values[::-1]},
+ dict(int=int_values[::-1], float=float_values[::-1]),
columns=["int", "float"],
index=[1, 0],
)
@@ -329,12 +329,12 @@ def test_sort_values_nat_values_in_int_column(self):
# and now check if NaT is still considered as "na" for datetime64
# columns:
df = DataFrame(
- {"datetime": [Timestamp("2016-01-01"), NaT], "float": float_values},
+ dict(datetime=[Timestamp("2016-01-01"), NaT], float=float_values),
columns=["datetime", "float"],
)
df_reversed = DataFrame(
- {"datetime": [NaT, Timestamp("2016-01-01")], "float": float_values[::-1]},
+ dict(datetime=[NaT, Timestamp("2016-01-01")], float=float_values[::-1]),
columns=["datetime", "float"],
index=[1, 0],
)
diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py
index 4cf0b1febf0af..7babc6853aef3 100644
--- a/pandas/tests/frame/methods/test_to_csv.py
+++ b/pandas/tests/frame/methods/test_to_csv.py
@@ -38,7 +38,7 @@
class TestDataFrameToCSV:
def read_csv(self, path, **kwargs):
- params = {"index_col": 0, "parse_dates": True}
+ params = dict(index_col=0, parse_dates=True)
params.update(**kwargs)
return pd.read_csv(path, **params)
@@ -248,10 +248,10 @@ def make_dtnat_arr(n, nnat=None):
# s3=make_dtnjat_arr(chunksize+5,0)
with tm.ensure_clean("1.csv") as pth:
- df = DataFrame({"a": s1, "b": s2})
+ df = DataFrame(dict(a=s1, b=s2))
df.to_csv(pth, chunksize=chunksize)
- recons = self.read_csv(pth).apply(to_datetime)
+ recons = self.read_csv(pth)._convert(datetime=True, coerce=True)
tm.assert_frame_equal(df, recons, check_names=False)
@pytest.mark.slow
@@ -260,7 +260,7 @@ def _do_test(
df, r_dtype=None, c_dtype=None, rnlvl=None, cnlvl=None, dupe_col=False
):
- kwargs = {"parse_dates": False}
+ kwargs = dict(parse_dates=False)
if cnlvl:
if rnlvl is not None:
kwargs["index_col"] = list(range(rnlvl))
@@ -291,7 +291,7 @@ def _to_uni(x):
recons.index = ix
recons = recons.iloc[:, rnlvl - 1 :]
- type_map = {"i": "i", "f": "f", "s": "O", "u": "O", "dt": "O", "p": "O"}
+ type_map = dict(i="i", f="f", s="O", u="O", dt="O", p="O")
if r_dtype:
if r_dtype == "u": # unicode
r_dtype = "O"
@@ -738,7 +738,7 @@ def create_cols(name):
df = pd.concat([df_float, df_int, df_bool, df_object, df_dt], axis=1)
# dtype
- dtypes = {}
+ dtypes = dict()
for n, dtype in [
("float", np.float64),
("int", np.int64),
diff --git a/pandas/tests/frame/methods/test_to_records.py b/pandas/tests/frame/methods/test_to_records.py
index 4d40f191a904b..d9c999c9119f4 100644
--- a/pandas/tests/frame/methods/test_to_records.py
+++ b/pandas/tests/frame/methods/test_to_records.py
@@ -131,7 +131,7 @@ def test_to_records_with_categorical(self):
[
# No dtypes --> default to array dtypes.
(
- {},
+ dict(),
np.rec.array(
[(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
dtype=[("index", " b"
diff --git a/pandas/tests/generic/test_duplicate_labels.py b/pandas/tests/generic/test_duplicate_labels.py
index 1b32675ec2d35..300f4cd72573a 100644
--- a/pandas/tests/generic/test_duplicate_labels.py
+++ b/pandas/tests/generic/test_duplicate_labels.py
@@ -203,7 +203,7 @@ def test_concat(self, objs, kwargs):
pd.DataFrame({"B": [0, 1]}, index=["a", "d"]).set_flags(
allows_duplicate_labels=False
),
- {"left_index": True, "right_index": True},
+ dict(left_index=True, right_index=True),
False,
marks=not_implemented,
),
@@ -213,7 +213,7 @@ def test_concat(self, objs, kwargs):
allows_duplicate_labels=False
),
pd.DataFrame({"B": [0, 1]}, index=["a", "d"]),
- {"left_index": True, "right_index": True},
+ dict(left_index=True, right_index=True),
False,
marks=not_implemented,
),
@@ -221,7 +221,7 @@ def test_concat(self, objs, kwargs):
(
pd.DataFrame({"A": [0, 1]}, index=["a", "b"]),
pd.DataFrame({"B": [0, 1]}, index=["a", "d"]),
- {"left_index": True, "right_index": True},
+ dict(left_index=True, right_index=True),
True,
),
],
diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py
index 975cebe16dc55..151ec03662335 100644
--- a/pandas/tests/groupby/test_apply.py
+++ b/pandas/tests/groupby/test_apply.py
@@ -665,7 +665,7 @@ def test_apply_aggregating_timedelta_and_datetime():
df["time_delta_zero"] = df.datetime - df.datetime
result = df.groupby("clientid").apply(
lambda ddf: Series(
- {"clientid_age": ddf.time_delta_zero.min(), "date": ddf.datetime.min()}
+ dict(clientid_age=ddf.time_delta_zero.min(), date=ddf.datetime.min())
)
)
expected = DataFrame(
@@ -784,7 +784,7 @@ def test_func(x):
def test_groupby_apply_return_empty_chunk():
# GH 22221: apply filter which returns some empty groups
- df = DataFrame({"value": [0, 1], "group": ["filled", "empty"]})
+ df = DataFrame(dict(value=[0, 1], group=["filled", "empty"]))
groups = df.groupby("group")
result = groups.apply(lambda group: group[group.value != 1]["value"])
expected = Series(
@@ -1087,25 +1087,3 @@ def test_apply_by_cols_equals_apply_by_rows_transposed():
tm.assert_frame_equal(by_cols, by_rows.T)
tm.assert_frame_equal(by_cols, df)
-
-
-def test_apply_dropna_with_indexed_same():
- # GH 38227
-
- df = DataFrame(
- {
- "col": [1, 2, 3, 4, 5],
- "group": ["a", np.nan, np.nan, "b", "b"],
- },
- index=list("xxyxz"),
- )
- result = df.groupby("group").apply(lambda x: x)
- expected = DataFrame(
- {
- "col": [1, 4, 5],
- "group": ["a", "b", "b"],
- },
- index=list("xxz"),
- )
-
- tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
index c915c95294ba0..e49e69a39b315 100644
--- a/pandas/tests/groupby/test_function.py
+++ b/pandas/tests/groupby/test_function.py
@@ -145,247 +145,203 @@ def test_builtins_apply(keys, f):
tm.assert_series_equal(getattr(result, fname)(), getattr(df, fname)())
-class TestNumericOnly:
- # make sure that we are passing thru kwargs to our agg functions
-
- @pytest.fixture
- def df(self):
- # GH3668
- # GH5724
- df = DataFrame(
- {
- "group": [1, 1, 2],
- "int": [1, 2, 3],
- "float": [4.0, 5.0, 6.0],
- "string": list("abc"),
- "category_string": Series(list("abc")).astype("category"),
- "category_int": [7, 8, 9],
- "datetime": date_range("20130101", periods=3),
- "datetimetz": date_range("20130101", periods=3, tz="US/Eastern"),
- "timedelta": pd.timedelta_range("1 s", periods=3, freq="s"),
- },
- columns=[
- "group",
- "int",
- "float",
- "string",
- "category_string",
- "category_int",
- "datetime",
- "datetimetz",
- "timedelta",
+def test_arg_passthru():
+ # make sure that we are passing thru kwargs
+ # to our agg functions
+
+ # GH3668
+ # GH5724
+ df = DataFrame(
+ {
+ "group": [1, 1, 2],
+ "int": [1, 2, 3],
+ "float": [4.0, 5.0, 6.0],
+ "string": list("abc"),
+ "category_string": Series(list("abc")).astype("category"),
+ "category_int": [7, 8, 9],
+ "datetime": pd.date_range("20130101", periods=3),
+ "datetimetz": pd.date_range("20130101", periods=3, tz="US/Eastern"),
+ "timedelta": pd.timedelta_range("1 s", periods=3, freq="s"),
+ },
+ columns=[
+ "group",
+ "int",
+ "float",
+ "string",
+ "category_string",
+ "category_int",
+ "datetime",
+ "datetimetz",
+ "timedelta",
+ ],
+ )
+
+ expected_columns_numeric = Index(["int", "float", "category_int"])
+
+ # mean / median
+ expected = DataFrame(
+ {
+ "category_int": [7.5, 9],
+ "float": [4.5, 6.0],
+ "timedelta": [pd.Timedelta("1.5s"), pd.Timedelta("3s")],
+ "int": [1.5, 3],
+ "datetime": [
+ Timestamp("2013-01-01 12:00:00"),
+ Timestamp("2013-01-03 00:00:00"),
],
- )
- return df
-
- @pytest.mark.parametrize("method", ["mean", "median"])
- def test_averages(self, df, method):
- # mean / median
- expected_columns_numeric = Index(["int", "float", "category_int"])
-
- gb = df.groupby("group")
- expected = DataFrame(
- {
- "category_int": [7.5, 9],
- "float": [4.5, 6.0],
- "timedelta": [pd.Timedelta("1.5s"), pd.Timedelta("3s")],
- "int": [1.5, 3],
- "datetime": [
- Timestamp("2013-01-01 12:00:00"),
- Timestamp("2013-01-03 00:00:00"),
- ],
- "datetimetz": [
- Timestamp("2013-01-01 12:00:00", tz="US/Eastern"),
- Timestamp("2013-01-03 00:00:00", tz="US/Eastern"),
- ],
- },
- index=Index([1, 2], name="group"),
- columns=[
- "int",
- "float",
- "category_int",
- "datetime",
- "datetimetz",
- "timedelta",
+ "datetimetz": [
+ Timestamp("2013-01-01 12:00:00", tz="US/Eastern"),
+ Timestamp("2013-01-03 00:00:00", tz="US/Eastern"),
],
- )
+ },
+ index=Index([1, 2], name="group"),
+ columns=["int", "float", "category_int", "datetime", "datetimetz", "timedelta"],
+ )
- result = getattr(gb, method)(numeric_only=False)
+ for attr in ["mean", "median"]:
+ result = getattr(df.groupby("group"), attr)()
+ tm.assert_index_equal(result.columns, expected_columns_numeric)
+
+ result = getattr(df.groupby("group"), attr)(numeric_only=False)
tm.assert_frame_equal(result.reindex_like(expected), expected)
- expected_columns = expected.columns
+ # TODO: min, max *should* handle
+ # categorical (ordered) dtype
+ expected_columns = Index(
+ [
+ "int",
+ "float",
+ "string",
+ "category_int",
+ "datetime",
+ "datetimetz",
+ "timedelta",
+ ]
+ )
+ for attr in ["min", "max"]:
+ result = getattr(df.groupby("group"), attr)()
+ tm.assert_index_equal(result.columns, expected_columns)
- self._check(df, method, expected_columns, expected_columns_numeric)
+ result = getattr(df.groupby("group"), attr)(numeric_only=False)
+ tm.assert_index_equal(result.columns, expected_columns)
- @pytest.mark.parametrize("method", ["min", "max"])
- def test_extrema(self, df, method):
- # TODO: min, max *should* handle
- # categorical (ordered) dtype
+ expected_columns = Index(
+ [
+ "int",
+ "float",
+ "string",
+ "category_string",
+ "category_int",
+ "datetime",
+ "datetimetz",
+ "timedelta",
+ ]
+ )
+ for attr in ["first", "last"]:
+ result = getattr(df.groupby("group"), attr)()
+ tm.assert_index_equal(result.columns, expected_columns)
- expected_columns = Index(
- [
- "int",
- "float",
- "string",
- "category_int",
- "datetime",
- "datetimetz",
- "timedelta",
- ]
- )
- expected_columns_numeric = expected_columns
+ result = getattr(df.groupby("group"), attr)(numeric_only=False)
+ tm.assert_index_equal(result.columns, expected_columns)
- self._check(df, method, expected_columns, expected_columns_numeric)
+ expected_columns = Index(["int", "float", "string", "category_int", "timedelta"])
- @pytest.mark.parametrize("method", ["first", "last"])
- def test_first_last(self, df, method):
+ result = df.groupby("group").sum()
+ tm.assert_index_equal(result.columns, expected_columns_numeric)
- expected_columns = Index(
- [
- "int",
- "float",
- "string",
- "category_string",
- "category_int",
- "datetime",
- "datetimetz",
- "timedelta",
- ]
- )
- expected_columns_numeric = expected_columns
+ result = df.groupby("group").sum(numeric_only=False)
+ tm.assert_index_equal(result.columns, expected_columns)
- self._check(df, method, expected_columns, expected_columns_numeric)
+ expected_columns = Index(["int", "float", "category_int"])
+ for attr in ["prod", "cumprod"]:
+ result = getattr(df.groupby("group"), attr)()
+ tm.assert_index_equal(result.columns, expected_columns_numeric)
- @pytest.mark.parametrize("method", ["sum", "cumsum"])
- def test_sum_cumsum(self, df, method):
+ result = getattr(df.groupby("group"), attr)(numeric_only=False)
+ tm.assert_index_equal(result.columns, expected_columns)
- expected_columns_numeric = Index(["int", "float", "category_int"])
- expected_columns = Index(
- ["int", "float", "string", "category_int", "timedelta"]
- )
- if method == "cumsum":
- # cumsum loses string
- expected_columns = Index(["int", "float", "category_int", "timedelta"])
+ # like min, max, but don't include strings
+ expected_columns = Index(
+ ["int", "float", "category_int", "datetime", "datetimetz", "timedelta"]
+ )
+ for attr in ["cummin", "cummax"]:
+ result = getattr(df.groupby("group"), attr)()
+ # GH 15561: numeric_only=False set by default like min/max
+ tm.assert_index_equal(result.columns, expected_columns)
- self._check(df, method, expected_columns, expected_columns_numeric)
+ result = getattr(df.groupby("group"), attr)(numeric_only=False)
+ tm.assert_index_equal(result.columns, expected_columns)
- @pytest.mark.parametrize("method", ["prod", "cumprod"])
- def test_prod_cumprod(self, df, method):
+ expected_columns = Index(["int", "float", "category_int", "timedelta"])
- expected_columns = Index(["int", "float", "category_int"])
- expected_columns_numeric = expected_columns
+ result = getattr(df.groupby("group"), "cumsum")()
+ tm.assert_index_equal(result.columns, expected_columns_numeric)
- self._check(df, method, expected_columns, expected_columns_numeric)
+ result = getattr(df.groupby("group"), "cumsum")(numeric_only=False)
+ tm.assert_index_equal(result.columns, expected_columns)
- @pytest.mark.parametrize("method", ["cummin", "cummax"])
- def test_cummin_cummax(self, df, method):
- # like min, max, but don't include strings
- expected_columns = Index(
- ["int", "float", "category_int", "datetime", "datetimetz", "timedelta"]
- )
- # GH#15561: numeric_only=False set by default like min/max
- expected_columns_numeric = expected_columns
+def test_non_cython_api():
- self._check(df, method, expected_columns, expected_columns_numeric)
+ # GH5610
+ # non-cython calls should not include the grouper
- def _check(self, df, method, expected_columns, expected_columns_numeric):
- gb = df.groupby("group")
+ df = DataFrame(
+ [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, "baz"]], columns=["A", "B", "C"]
+ )
+ g = df.groupby("A")
+ gni = df.groupby("A", as_index=False)
- result = getattr(gb, method)()
- tm.assert_index_equal(result.columns, expected_columns_numeric)
+ # mad
+ expected = DataFrame([[0], [np.nan]], columns=["B"], index=[1, 3])
+ expected.index.name = "A"
+ result = g.mad()
+ tm.assert_frame_equal(result, expected)
- result = getattr(gb, method)(numeric_only=False)
- tm.assert_index_equal(result.columns, expected_columns)
+ expected = DataFrame([[1, 0.0], [3, np.nan]], columns=["A", "B"], index=[0, 1])
+ result = gni.mad()
+ tm.assert_frame_equal(result, expected)
+ # describe
+ expected_index = Index([1, 3], name="A")
+ expected_col = pd.MultiIndex(
+ levels=[["B"], ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]],
+ codes=[[0] * 8, list(range(8))],
+ )
+ expected = DataFrame(
+ [
+ [1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0],
+ [0.0, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
+ ],
+ index=expected_index,
+ columns=expected_col,
+ )
+ result = g.describe()
+ tm.assert_frame_equal(result, expected)
-class TestGroupByNonCythonPaths:
- # GH#5610 non-cython calls should not include the grouper
- # Tests for code not expected to go through cython paths.
+ expected = pd.concat(
+ [
+ df[df.A == 1].describe().unstack().to_frame().T,
+ df[df.A == 3].describe().unstack().to_frame().T,
+ ]
+ )
+ expected.index = Index([0, 1])
+ result = gni.describe()
+ tm.assert_frame_equal(result, expected)
- @pytest.fixture
- def df(self):
- df = DataFrame(
- [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, "baz"]],
- columns=["A", "B", "C"],
- )
- return df
-
- @pytest.fixture
- def gb(self, df):
- gb = df.groupby("A")
- return gb
-
- @pytest.fixture
- def gni(self, df):
- gni = df.groupby("A", as_index=False)
- return gni
-
- # TODO: non-unique columns, as_index=False
- def test_idxmax(self, gb):
- # object dtype so idxmax goes through _aggregate_item_by_item
- # GH#5610
- # non-cython calls should not include the grouper
- expected = DataFrame([[0.0], [np.nan]], columns=["B"], index=[1, 3])
- expected.index.name = "A"
- result = gb.idxmax()
- tm.assert_frame_equal(result, expected)
-
- def test_idxmin(self, gb):
- # object dtype so idxmax goes through _aggregate_item_by_item
- # GH#5610
- # non-cython calls should not include the grouper
- expected = DataFrame([[0.0], [np.nan]], columns=["B"], index=[1, 3])
- expected.index.name = "A"
- result = gb.idxmin()
- tm.assert_frame_equal(result, expected)
-
- def test_any(self, gb):
- expected = DataFrame(
- [[True, True], [False, True]], columns=["B", "C"], index=[1, 3]
- )
- expected.index.name = "A"
- result = gb.any()
- tm.assert_frame_equal(result, expected)
-
- def test_mad(self, gb, gni):
- # mad
- expected = DataFrame([[0], [np.nan]], columns=["B"], index=[1, 3])
- expected.index.name = "A"
- result = gb.mad()
- tm.assert_frame_equal(result, expected)
-
- expected = DataFrame([[1, 0.0], [3, np.nan]], columns=["A", "B"], index=[0, 1])
- result = gni.mad()
- tm.assert_frame_equal(result, expected)
-
- def test_describe(self, df, gb, gni):
- # describe
- expected_index = Index([1, 3], name="A")
- expected_col = pd.MultiIndex(
- levels=[["B"], ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]],
- codes=[[0] * 8, list(range(8))],
- )
- expected = DataFrame(
- [
- [1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0],
- [0.0, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
- ],
- index=expected_index,
- columns=expected_col,
- )
- result = gb.describe()
- tm.assert_frame_equal(result, expected)
+ # any
+ expected = DataFrame(
+ [[True, True], [False, True]], columns=["B", "C"], index=[1, 3]
+ )
+ expected.index.name = "A"
+ result = g.any()
+ tm.assert_frame_equal(result, expected)
- expected = pd.concat(
- [
- df[df.A == 1].describe().unstack().to_frame().T,
- df[df.A == 3].describe().unstack().to_frame().T,
- ]
- )
- expected.index = Index([0, 1])
- result = gni.describe()
- tm.assert_frame_equal(result, expected)
+ # idxmax
+ expected = DataFrame([[0.0], [np.nan]], columns=["B"], index=[1, 3])
+ expected.index.name = "A"
+ result = g.idxmax()
+ tm.assert_frame_equal(result, expected)
def test_cython_api2():
@@ -539,27 +495,6 @@ def test_idxmin_idxmax_returns_int_types(func, values):
tm.assert_frame_equal(result, expected)
-def test_idxmin_idxmax_axis1():
- df = DataFrame(np.random.randn(10, 4), columns=["A", "B", "C", "D"])
- df["A"] = [1, 2, 3, 1, 2, 3, 1, 2, 3, 4]
-
- gb = df.groupby("A")
-
- res = gb.idxmax(axis=1)
-
- alt = df.iloc[:, 1:].idxmax(axis=1)
- indexer = res.index.get_level_values(1)
-
- tm.assert_series_equal(alt[indexer], res.droplevel("A"))
-
- df["E"] = pd.date_range("2016-01-01", periods=10)
- gb2 = df.groupby("A")
-
- msg = "reduction operation 'argmax' not allowed for this dtype"
- with pytest.raises(TypeError, match=msg):
- gb2.idxmax(axis=1)
-
-
def test_groupby_cumprod():
# GH 4095
df = DataFrame({"key": ["b"] * 10, "value": 2})
@@ -757,14 +692,14 @@ def test_cummin(numpy_dtypes_for_minmax):
tm.assert_frame_equal(result, expected)
# GH 15561
- df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])})
+ df = DataFrame(dict(a=[1], b=pd.to_datetime(["2001"])))
expected = Series(pd.to_datetime("2001"), index=[0], name="b")
result = df.groupby("a")["b"].cummin()
tm.assert_series_equal(expected, result)
# GH 15635
- df = DataFrame({"a": [1, 2, 1], "b": [1, 2, 2]})
+ df = DataFrame(dict(a=[1, 2, 1], b=[1, 2, 2]))
result = df.groupby("a").b.cummin()
expected = Series([1, 2, 1], name="b")
tm.assert_series_equal(result, expected)
@@ -813,14 +748,14 @@ def test_cummax(numpy_dtypes_for_minmax):
tm.assert_frame_equal(result, expected)
# GH 15561
- df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])})
+ df = DataFrame(dict(a=[1], b=pd.to_datetime(["2001"])))
expected = Series(pd.to_datetime("2001"), index=[0], name="b")
result = df.groupby("a")["b"].cummax()
tm.assert_series_equal(expected, result)
# GH 15635
- df = DataFrame({"a": [1, 2, 1], "b": [2, 1, 1]})
+ df = DataFrame(dict(a=[1, 2, 1], b=[2, 1, 1]))
result = df.groupby("a").b.cummax()
expected = Series([2, 1, 2], name="b")
tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 7c179a79513fa..78c438fa11a0e 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -149,11 +149,11 @@ def test_inconsistent_return_type():
# GH5592
# inconsistent return type
df = DataFrame(
- {
- "A": ["Tiger", "Tiger", "Tiger", "Lamb", "Lamb", "Pony", "Pony"],
- "B": Series(np.arange(7), dtype="int64"),
- "C": date_range("20130101", periods=7),
- }
+ dict(
+ A=["Tiger", "Tiger", "Tiger", "Lamb", "Lamb", "Pony", "Pony"],
+ B=Series(np.arange(7), dtype="int64"),
+ C=date_range("20130101", periods=7),
+ )
)
def f(grp):
@@ -257,7 +257,7 @@ def test_len():
assert len(grouped) == expected
# issue 11016
- df = DataFrame({"a": [np.nan] * 3, "b": [1, 2, 3]})
+ df = DataFrame(dict(a=[np.nan] * 3, b=[1, 2, 3]))
assert len(df.groupby("a")) == 0
assert len(df.groupby("b")) == 3
assert len(df.groupby(["a", "b"])) == 3
diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py
index 1d2208592a06d..4aefb73bf912c 100644
--- a/pandas/tests/groupby/test_grouping.py
+++ b/pandas/tests/groupby/test_grouping.py
@@ -307,8 +307,9 @@ def test_groupby_levels_and_columns(self):
# reset_index changes columns dtype to object
by_columns = df.reset_index().groupby(idx_names).mean()
- # without casting, by_columns.columns is object-dtype
- by_columns.columns = by_columns.columns.astype(np.int64)
+ tm.assert_frame_equal(by_levels, by_columns, check_column_type=False)
+
+ by_columns.columns = Index(by_columns.columns, dtype=np.int64)
tm.assert_frame_equal(by_levels, by_columns)
def test_groupby_categorical_index_and_columns(self, observed):
diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py
index 26b3af4234be1..699cd88b5c53c 100644
--- a/pandas/tests/groupby/test_nth.py
+++ b/pandas/tests/groupby/test_nth.py
@@ -101,26 +101,6 @@ def test_first_last_with_None(method):
tm.assert_frame_equal(result, df)
-@pytest.mark.parametrize("method", ["first", "last"])
-@pytest.mark.parametrize(
- "df, expected",
- [
- (
- DataFrame({"id": "a", "value": [None, "foo", np.nan]}),
- DataFrame({"value": ["foo"]}, index=Index(["a"], name="id")),
- ),
- (
- DataFrame({"id": "a", "value": [np.nan]}, dtype=object),
- DataFrame({"value": [None]}, index=Index(["a"], name="id")),
- ),
- ],
-)
-def test_first_last_with_None_expanded(method, df, expected):
- # GH 32800, 38286
- result = getattr(df.groupby("id"), method)()
- tm.assert_frame_equal(result, expected)
-
-
def test_first_last_nth_dtypes(df_mixed_floats):
df = df_mixed_floats.copy()
diff --git a/pandas/tests/groupby/test_quantile.py b/pandas/tests/groupby/test_quantile.py
index 76fc82c6288eb..bd6d33c59a48a 100644
--- a/pandas/tests/groupby/test_quantile.py
+++ b/pandas/tests/groupby/test_quantile.py
@@ -157,7 +157,7 @@ def test_quantile_raises():
def test_quantile_out_of_bounds_q_raises():
# https://github.com/pandas-dev/pandas/issues/27470
- df = DataFrame({"a": [0, 0, 0, 1, 1, 1], "b": range(6)})
+ df = DataFrame(dict(a=[0, 0, 0, 1, 1, 1], b=range(6)))
g = df.groupby([0, 0, 0, 1, 1, 1])
with pytest.raises(ValueError, match="Got '50.0' instead"):
g.quantile(50)
@@ -169,7 +169,7 @@ def test_quantile_out_of_bounds_q_raises():
def test_quantile_missing_group_values_no_segfaults():
# GH 28662
data = np.array([1.0, np.nan, 1.0])
- df = DataFrame({"key": data, "val": range(3)})
+ df = DataFrame(dict(key=data, val=range(3)))
# Random segfaults; would have been guaranteed in loop
grp = df.groupby("key")
@@ -254,26 +254,3 @@ def test_groupby_timedelta_quantile():
index=Index([1, 2], name="group"),
)
tm.assert_frame_equal(result, expected)
-
-
-def test_columns_groupby_quantile():
- # GH 33795
- df = DataFrame(
- np.arange(12).reshape(3, -1),
- index=list("XYZ"),
- columns=pd.Series(list("ABAB"), name="col"),
- )
- result = df.groupby("col", axis=1).quantile(q=[0.8, 0.2])
- expected = DataFrame(
- [
- [1.6, 0.4, 2.6, 1.4],
- [5.6, 4.4, 6.6, 5.4],
- [9.6, 8.4, 10.6, 9.4],
- ],
- index=list("XYZ"),
- columns=Index(
- [("A", 0.8), ("A", 0.2), ("B", 0.8), ("B", 0.2)], names=["col", None]
- ),
- )
-
- tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/groupby/test_rank.py b/pandas/tests/groupby/test_rank.py
index ef6b4ae4836f8..3461bf6e10662 100644
--- a/pandas/tests/groupby/test_rank.py
+++ b/pandas/tests/groupby/test_rank.py
@@ -42,10 +42,7 @@ def test_rank_apply():
@pytest.mark.parametrize(
"vals",
[
- np.array([2, 2, 8, 2, 6], dtype=dtype)
- for dtype in ["i8", "i4", "i2", "i1", "u8", "u4", "u2", "u1", "f8", "f4", "f2"]
- ]
- + [
+ [2, 2, 8, 2, 6],
[
pd.Timestamp("2018-01-02"),
pd.Timestamp("2018-01-02"),
@@ -53,29 +50,7 @@ def test_rank_apply():
pd.Timestamp("2018-01-02"),
pd.Timestamp("2018-01-06"),
],
- [
- pd.Timestamp("2018-01-02", tz="US/Pacific"),
- pd.Timestamp("2018-01-02", tz="US/Pacific"),
- pd.Timestamp("2018-01-08", tz="US/Pacific"),
- pd.Timestamp("2018-01-02", tz="US/Pacific"),
- pd.Timestamp("2018-01-06", tz="US/Pacific"),
- ],
- [
- pd.Timestamp("2018-01-02") - pd.Timestamp(0),
- pd.Timestamp("2018-01-02") - pd.Timestamp(0),
- pd.Timestamp("2018-01-08") - pd.Timestamp(0),
- pd.Timestamp("2018-01-02") - pd.Timestamp(0),
- pd.Timestamp("2018-01-06") - pd.Timestamp(0),
- ],
- [
- pd.Timestamp("2018-01-02").to_period("D"),
- pd.Timestamp("2018-01-02").to_period("D"),
- pd.Timestamp("2018-01-08").to_period("D"),
- pd.Timestamp("2018-01-02").to_period("D"),
- pd.Timestamp("2018-01-06").to_period("D"),
- ],
],
- ids=lambda x: type(x[0]),
)
@pytest.mark.parametrize(
"ties_method,ascending,pct,exp",
@@ -104,12 +79,7 @@ def test_rank_apply():
)
def test_rank_args(grps, vals, ties_method, ascending, pct, exp):
key = np.repeat(grps, len(vals))
-
- orig_vals = vals
- vals = list(vals) * len(grps)
- if isinstance(orig_vals, np.ndarray):
- vals = np.array(vals, dtype=orig_vals.dtype)
-
+ vals = vals * len(grps)
df = DataFrame({"key": key, "val": vals})
result = df.groupby("key").rank(method=ties_method, ascending=ascending, pct=pct)
@@ -172,10 +142,7 @@ def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp):
@pytest.mark.parametrize(
"vals",
[
- np.array([2, 2, np.nan, 8, 2, 6, np.nan, np.nan], dtype=dtype)
- for dtype in ["f8", "f4", "f2"]
- ]
- + [
+ [2, 2, np.nan, 8, 2, 6, np.nan, np.nan],
[
pd.Timestamp("2018-01-02"),
pd.Timestamp("2018-01-02"),
@@ -186,38 +153,7 @@ def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp):
np.nan,
np.nan,
],
- [
- pd.Timestamp("2018-01-02", tz="US/Pacific"),
- pd.Timestamp("2018-01-02", tz="US/Pacific"),
- np.nan,
- pd.Timestamp("2018-01-08", tz="US/Pacific"),
- pd.Timestamp("2018-01-02", tz="US/Pacific"),
- pd.Timestamp("2018-01-06", tz="US/Pacific"),
- np.nan,
- np.nan,
- ],
- [
- pd.Timestamp("2018-01-02") - pd.Timestamp(0),
- pd.Timestamp("2018-01-02") - pd.Timestamp(0),
- np.nan,
- pd.Timestamp("2018-01-08") - pd.Timestamp(0),
- pd.Timestamp("2018-01-02") - pd.Timestamp(0),
- pd.Timestamp("2018-01-06") - pd.Timestamp(0),
- np.nan,
- np.nan,
- ],
- [
- pd.Timestamp("2018-01-02").to_period("D"),
- pd.Timestamp("2018-01-02").to_period("D"),
- np.nan,
- pd.Timestamp("2018-01-08").to_period("D"),
- pd.Timestamp("2018-01-02").to_period("D"),
- pd.Timestamp("2018-01-06").to_period("D"),
- np.nan,
- np.nan,
- ],
],
- ids=lambda x: type(x[0]),
)
@pytest.mark.parametrize(
"ties_method,ascending,na_option,pct,exp",
@@ -410,12 +346,7 @@ def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp):
)
def test_rank_args_missing(grps, vals, ties_method, ascending, na_option, pct, exp):
key = np.repeat(grps, len(vals))
-
- orig_vals = vals
- vals = list(vals) * len(grps)
- if isinstance(orig_vals, np.ndarray):
- vals = np.array(vals, dtype=orig_vals.dtype)
-
+ vals = vals * len(grps)
df = DataFrame({"key": key, "val": vals})
result = df.groupby("key").rank(
method=ties_method, ascending=ascending, na_option=na_option, pct=pct
diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py
index 28095c0b0c39f..2340168415382 100644
--- a/pandas/tests/groupby/test_timegrouper.py
+++ b/pandas/tests/groupby/test_timegrouper.py
@@ -651,7 +651,7 @@ def test_groupby_first_datetime64(self):
def test_groupby_max_datetime64(self):
# GH 5869
# datetimelike dtype conversion from int
- df = DataFrame({"A": Timestamp("20130101"), "B": np.arange(5)})
+ df = DataFrame(dict(A=Timestamp("20130101"), B=np.arange(5)))
expected = df.groupby("A")["A"].apply(lambda x: x.max())
result = df.groupby("A")["A"].max()
tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py
index c5d454baa7e7b..c86cb4532bc26 100644
--- a/pandas/tests/groupby/test_value_counts.py
+++ b/pandas/tests/groupby/test_value_counts.py
@@ -65,13 +65,9 @@ def rebuild_index(df):
df.index = MultiIndex.from_arrays(arr, names=df.index.names)
return df
- kwargs = {
- "normalize": normalize,
- "sort": sort,
- "ascending": ascending,
- "dropna": dropna,
- "bins": bins,
- }
+ kwargs = dict(
+ normalize=normalize, sort=sort, ascending=ascending, dropna=dropna, bins=bins
+ )
gr = df.groupby(keys, sort=isort)
left = gr["3rd"].value_counts(**kwargs)
diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py
index 8acd051fbc643..b4e023f569844 100644
--- a/pandas/tests/groupby/transform/test_transform.py
+++ b/pandas/tests/groupby/transform/test_transform.py
@@ -228,7 +228,7 @@ def test_transform_dtype():
def test_transform_bug():
# GH 5712
# transforming on a datetime column
- df = DataFrame({"A": Timestamp("20130101"), "B": np.arange(5)})
+ df = DataFrame(dict(A=Timestamp("20130101"), B=np.arange(5)))
result = df.groupby("A")["B"].transform(lambda x: x.rank(ascending=False))
expected = Series(np.arange(5, 0, step=-1), name="B")
tm.assert_series_equal(result, expected)
@@ -251,7 +251,7 @@ def test_transform_numeric_to_boolean():
def test_transform_datetime_to_timedelta():
# GH 15429
# transforming a datetime to timedelta
- df = DataFrame({"A": Timestamp("20130101"), "B": np.arange(5)})
+ df = DataFrame(dict(A=Timestamp("20130101"), B=np.arange(5)))
expected = Series([Timestamp("20130101") - Timestamp("20130101")] * 5, name="A")
# this does date math without changing result type in transform
@@ -442,7 +442,7 @@ def test_transform_coercion():
# 14457
# when we are transforming be sure to not coerce
# via assignment
- df = DataFrame({"A": ["a", "a"], "B": [0, 1]})
+ df = DataFrame(dict(A=["a", "a"], B=[0, 1]))
g = df.groupby("A")
expected = g.transform(np.mean)
@@ -456,37 +456,30 @@ def test_groupby_transform_with_int():
# floats
df = DataFrame(
- {
- "A": [1, 1, 1, 2, 2, 2],
- "B": Series(1, dtype="float64"),
- "C": Series([1, 2, 3, 1, 2, 3], dtype="float64"),
- "D": "foo",
- }
+ dict(
+ A=[1, 1, 1, 2, 2, 2],
+ B=Series(1, dtype="float64"),
+ C=Series([1, 2, 3, 1, 2, 3], dtype="float64"),
+ D="foo",
+ )
)
with np.errstate(all="ignore"):
result = df.groupby("A").transform(lambda x: (x - x.mean()) / x.std())
expected = DataFrame(
- {"B": np.nan, "C": Series([-1, 0, 1, -1, 0, 1], dtype="float64")}
+ dict(B=np.nan, C=Series([-1, 0, 1, -1, 0, 1], dtype="float64"))
)
tm.assert_frame_equal(result, expected)
# int case
- df = DataFrame(
- {
- "A": [1, 1, 1, 2, 2, 2],
- "B": 1,
- "C": [1, 2, 3, 1, 2, 3],
- "D": "foo",
- }
- )
+ df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=1, C=[1, 2, 3, 1, 2, 3], D="foo"))
with np.errstate(all="ignore"):
result = df.groupby("A").transform(lambda x: (x - x.mean()) / x.std())
- expected = DataFrame({"B": np.nan, "C": [-1, 0, 1, -1, 0, 1]})
+ expected = DataFrame(dict(B=np.nan, C=[-1, 0, 1, -1, 0, 1]))
tm.assert_frame_equal(result, expected)
# int that needs float conversion
s = Series([2, 3, 4, 10, 5, -1])
- df = DataFrame({"A": [1, 1, 1, 2, 2, 2], "B": 1, "C": s, "D": "foo"})
+ df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=1, C=s, D="foo"))
with np.errstate(all="ignore"):
result = df.groupby("A").transform(lambda x: (x - x.mean()) / x.std())
@@ -494,12 +487,12 @@ def test_groupby_transform_with_int():
s1 = (s1 - s1.mean()) / s1.std()
s2 = s.iloc[3:6]
s2 = (s2 - s2.mean()) / s2.std()
- expected = DataFrame({"B": np.nan, "C": concat([s1, s2])})
+ expected = DataFrame(dict(B=np.nan, C=concat([s1, s2])))
tm.assert_frame_equal(result, expected)
# int downcasting
result = df.groupby("A").transform(lambda x: x * 2 / 2)
- expected = DataFrame({"B": 1, "C": [2, 3, 4, 10, 5, -1]})
+ expected = DataFrame(dict(B=1, C=[2, 3, 4, 10, 5, -1]))
tm.assert_frame_equal(result, expected)
@@ -666,11 +659,11 @@ def test_cython_transform_frame(op, args, targop):
# group by values, index level, columns
for df in [df, df2]:
for gb_target in [
- {"by": labels},
- {"level": 0},
- {"by": "string"},
- ]: # {"by": 'string_missing'}]:
- # {"by": ['int','string']}]:
+ dict(by=labels),
+ dict(level=0),
+ dict(by="string"),
+ ]: # dict(by='string_missing')]:
+ # dict(by=['int','string'])]:
gb = df.groupby(**gb_target)
# allowlisted methods set the selection before applying
@@ -993,7 +986,7 @@ def test_transform_absent_categories(func):
x_vals = [1]
x_cats = range(2)
y = [1]
- df = DataFrame({"x": Categorical(x_vals, x_cats), "y": y})
+ df = DataFrame(dict(x=Categorical(x_vals, x_cats), y=y))
result = getattr(df.y.groupby(df.x), func)()
expected = df.y
tm.assert_series_equal(result, expected)
@@ -1012,7 +1005,7 @@ def test_ffill_not_in_axis(func, key, val):
def test_transform_invalid_name_raises():
# GH#27486
- df = DataFrame({"a": [0, 1, 1, 2]})
+ df = DataFrame(dict(a=[0, 1, 1, 2]))
g = df.groupby(["a", "b", "b", "c"])
with pytest.raises(ValueError, match="not a valid function name"):
g.transform("some_arbitrary_name")
@@ -1032,8 +1025,7 @@ def test_transform_invalid_name_raises():
"obj",
[
DataFrame(
- {"a": [0, 0, 0, 1, 1, 1], "b": range(6)},
- index=["A", "B", "C", "D", "E", "F"],
+ dict(a=[0, 0, 0, 1, 1, 1], b=range(6)), index=["A", "B", "C", "D", "E", "F"]
),
Series([0, 0, 0, 1, 1, 1], index=["A", "B", "C", "D", "E", "F"]),
],
diff --git a/pandas/tests/indexes/base_class/test_setops.py b/pandas/tests/indexes/base_class/test_setops.py
index ddcb3c5b87ebc..6413b110dff2e 100644
--- a/pandas/tests/indexes/base_class/test_setops.py
+++ b/pandas/tests/indexes/base_class/test_setops.py
@@ -141,7 +141,7 @@ def test_intersection_str_dates(self, sort):
@pytest.mark.parametrize(
"index2,expected_arr",
- [(Index(["B", "D"]), ["B"]), (Index(["B", "D", "A"]), ["A", "B"])],
+ [(Index(["B", "D"]), ["B"]), (Index(["B", "D", "A"]), ["A", "B", "A"])],
)
def test_intersection_non_monotonic_non_unique(self, index2, expected_arr, sort):
# non-monotonic non-unique
diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py
index 3bab57e1d265e..2e03c00638a5c 100644
--- a/pandas/tests/indexes/categorical/test_category.py
+++ b/pandas/tests/indexes/categorical/test_category.py
@@ -57,10 +57,10 @@ def test_append(self):
expected = CategoricalIndex(list("aabbcaca"), categories=categories)
tm.assert_index_equal(result, expected, exact=True)
- # invalid objects -> cast to object via concat_compat
- result = ci.append(Index(["a", "d"]))
- expected = Index(["a", "a", "b", "b", "c", "a", "a", "d"])
- tm.assert_index_equal(result, expected, exact=True)
+ # invalid objects
+ msg = "cannot append a non-category item to a CategoricalIndex"
+ with pytest.raises(TypeError, match=msg):
+ ci.append(Index(["a", "d"]))
# GH14298 - if base object is not categorical -> coerce to object
result = Index(["c", "a"]).append(ci)
diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py
index d098e5b639f25..e250d8cf1b326 100644
--- a/pandas/tests/indexes/common.py
+++ b/pandas/tests/indexes/common.py
@@ -73,10 +73,7 @@ def test_shift(self):
# GH8083 test the base class for shift
idx = self.create_index()
- msg = (
- f"This method is only implemented for DatetimeIndex, PeriodIndex and "
- f"TimedeltaIndex; Got type {type(idx).__name__}"
- )
+ msg = f"Not supported for type {type(idx).__name__}"
with pytest.raises(NotImplementedError, match=msg):
idx.shift(1)
with pytest.raises(NotImplementedError, match=msg):
diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py
index 789510b452969..2657fc817ec3a 100644
--- a/pandas/tests/indexes/datetimes/test_datetime.py
+++ b/pandas/tests/indexes/datetimes/test_datetime.py
@@ -265,12 +265,10 @@ def test_factorize(self):
arr, idx = idx1.factorize()
tm.assert_numpy_array_equal(arr, exp_arr)
tm.assert_index_equal(idx, exp_idx)
- assert idx.freq == exp_idx.freq
arr, idx = idx1.factorize(sort=True)
tm.assert_numpy_array_equal(arr, exp_arr)
tm.assert_index_equal(idx, exp_idx)
- assert idx.freq == exp_idx.freq
# tz must be preserved
idx1 = idx1.tz_localize("Asia/Tokyo")
@@ -279,7 +277,6 @@ def test_factorize(self):
arr, idx = idx1.factorize()
tm.assert_numpy_array_equal(arr, exp_arr)
tm.assert_index_equal(idx, exp_idx)
- assert idx.freq == exp_idx.freq
idx2 = DatetimeIndex(
["2014-03", "2014-03", "2014-02", "2014-01", "2014-03", "2014-01"]
@@ -290,31 +287,21 @@ def test_factorize(self):
arr, idx = idx2.factorize(sort=True)
tm.assert_numpy_array_equal(arr, exp_arr)
tm.assert_index_equal(idx, exp_idx)
- assert idx.freq == exp_idx.freq
exp_arr = np.array([0, 0, 1, 2, 0, 2], dtype=np.intp)
exp_idx = DatetimeIndex(["2014-03", "2014-02", "2014-01"])
arr, idx = idx2.factorize()
tm.assert_numpy_array_equal(arr, exp_arr)
tm.assert_index_equal(idx, exp_idx)
- assert idx.freq == exp_idx.freq
- def test_factorize_preserves_freq(self):
- # GH#38120 freq should be preserved
+ # freq must be preserved
idx3 = date_range("2000-01", periods=4, freq="M", tz="Asia/Tokyo")
exp_arr = np.array([0, 1, 2, 3], dtype=np.intp)
-
arr, idx = idx3.factorize()
tm.assert_numpy_array_equal(arr, exp_arr)
tm.assert_index_equal(idx, idx3)
- assert idx.freq == idx3.freq
-
- arr, idx = pd.factorize(idx3)
- tm.assert_numpy_array_equal(arr, exp_arr)
- tm.assert_index_equal(idx, idx3)
- assert idx.freq == idx3.freq
- def test_factorize_tz(self, tz_naive_fixture, index_or_series):
+ def test_factorize_tz(self, tz_naive_fixture):
tz = tz_naive_fixture
# GH#13750
base = date_range("2016-11-05", freq="H", periods=100, tz=tz)
@@ -322,33 +309,27 @@ def test_factorize_tz(self, tz_naive_fixture, index_or_series):
exp_arr = np.arange(100, dtype=np.intp).repeat(5)
- obj = index_or_series(idx)
-
- arr, res = obj.factorize()
- tm.assert_numpy_array_equal(arr, exp_arr)
- expected = base._with_freq(None)
- tm.assert_index_equal(res, expected)
- assert res.freq == expected.freq
+ for obj in [idx, pd.Series(idx)]:
+ arr, res = obj.factorize()
+ tm.assert_numpy_array_equal(arr, exp_arr)
+ expected = base._with_freq(None)
+ tm.assert_index_equal(res, expected)
- def test_factorize_dst(self, index_or_series):
+ def test_factorize_dst(self):
# GH 13750
idx = date_range("2016-11-06", freq="H", periods=12, tz="US/Eastern")
- obj = index_or_series(idx)
- arr, res = obj.factorize()
- tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp))
- tm.assert_index_equal(res, idx)
- if index_or_series is Index:
- assert res.freq == idx.freq
+ for obj in [idx, pd.Series(idx)]:
+ arr, res = obj.factorize()
+ tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp))
+ tm.assert_index_equal(res, idx)
idx = date_range("2016-06-13", freq="H", periods=12, tz="US/Eastern")
- obj = index_or_series(idx)
- arr, res = obj.factorize()
- tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp))
- tm.assert_index_equal(res, idx)
- if index_or_series is Index:
- assert res.freq == idx.freq
+ for obj in [idx, pd.Series(idx)]:
+ arr, res = obj.factorize()
+ tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp))
+ tm.assert_index_equal(res, idx)
@pytest.mark.parametrize(
"arr, expected",
diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py
index 93772e2c27a82..c8edd30e3f7aa 100644
--- a/pandas/tests/indexes/datetimes/test_setops.py
+++ b/pandas/tests/indexes/datetimes/test_setops.py
@@ -471,11 +471,10 @@ def test_intersection_bug(self):
def test_intersection_list(self):
# GH#35876
- # values is not an Index -> no name -> retain "a"
values = [pd.Timestamp("2020-01-01"), pd.Timestamp("2020-02-01")]
idx = DatetimeIndex(values, name="a")
res = idx.intersection(values)
- tm.assert_index_equal(res, idx)
+ tm.assert_index_equal(res, idx.rename(None))
def test_month_range_union_tz_pytz(self, sort):
from pytz import timezone
@@ -510,20 +509,6 @@ def test_month_range_union_tz_dateutil(self, sort):
early_dr.union(late_dr, sort=sort)
- @pytest.mark.parametrize("sort", [False, None])
- def test_intersection_duplicates(self, sort):
- # GH#38196
- idx1 = Index(
- [
- pd.Timestamp("2019-12-13"),
- pd.Timestamp("2019-12-12"),
- pd.Timestamp("2019-12-12"),
- ]
- )
- result = idx1.intersection(idx1, sort=sort)
- expected = Index([pd.Timestamp("2019-12-13"), pd.Timestamp("2019-12-12")])
- tm.assert_index_equal(result, expected)
-
class TestCustomDatetimeIndex:
def setup_method(self, method):
diff --git a/pandas/tests/indexes/interval/test_formats.py b/pandas/tests/indexes/interval/test_formats.py
index fb59334b2e129..0e8d7d1ba5aba 100644
--- a/pandas/tests/indexes/interval/test_formats.py
+++ b/pandas/tests/indexes/interval/test_formats.py
@@ -1,15 +1,7 @@
import numpy as np
import pytest
-from pandas import (
- DataFrame,
- Float64Index,
- Interval,
- IntervalIndex,
- Series,
- Timedelta,
- Timestamp,
-)
+from pandas import DataFrame, IntervalIndex, Series, Timedelta, Timestamp
import pandas._testing as tm
@@ -45,25 +37,6 @@ def test_repr_missing(self, constructor, expected):
result = repr(obj)
assert result == expected
- def test_repr_floats(self):
- # GH 32553
-
- markers = Series(
- ["foo", "bar"],
- index=IntervalIndex(
- [
- Interval(left, right)
- for left, right in zip(
- Float64Index([329.973, 345.137], dtype="float64"),
- Float64Index([345.137, 360.191], dtype="float64"),
- )
- ]
- ),
- )
- result = str(markers)
- expected = "(329.973, 345.137] foo\n(345.137, 360.191] bar\ndtype: object"
- assert result == expected
-
@pytest.mark.parametrize(
"tuples, closed, expected_data",
[
diff --git a/pandas/tests/indexes/interval/test_setops.py b/pandas/tests/indexes/interval/test_setops.py
index 0ef833bb93ded..0b94d70367b4d 100644
--- a/pandas/tests/indexes/interval/test_setops.py
+++ b/pandas/tests/indexes/interval/test_setops.py
@@ -32,17 +32,15 @@ def test_union(self, closed, sort):
tm.assert_index_equal(index.union(index, sort=sort), index)
tm.assert_index_equal(index.union(index[:1], sort=sort), index)
- def test_union_empty_result(self, closed, sort):
# GH 19101: empty result, same dtype
index = empty_index(dtype="int64", closed=closed)
result = index.union(index, sort=sort)
tm.assert_index_equal(result, index)
- # GH 19101: empty result, different dtypes -> common dtype is object
+ # GH 19101: empty result, different dtypes
other = empty_index(dtype="float64", closed=closed)
result = index.union(other, sort=sort)
- expected = Index([], dtype=object)
- tm.assert_index_equal(result, expected)
+ tm.assert_index_equal(result, index)
def test_intersection(self, closed, sort):
index = monotonic_index(0, 11, closed=closed)
diff --git a/pandas/tests/indexes/multi/test_analytics.py b/pandas/tests/indexes/multi/test_analytics.py
index 25e2f6a3777d1..bd4926880c13d 100644
--- a/pandas/tests/indexes/multi/test_analytics.py
+++ b/pandas/tests/indexes/multi/test_analytics.py
@@ -11,8 +11,7 @@
def test_shift(idx):
# GH8083 test the base class for shift
- msg = "This method is only implemented for DatetimeIndex, PeriodIndex and "
- "TimedeltaIndex; Got type MultiIndex"
+ msg = "Not supported for type MultiIndex"
with pytest.raises(NotImplementedError, match=msg):
idx.shift(1)
with pytest.raises(NotImplementedError, match=msg):
diff --git a/pandas/tests/indexes/multi/test_drop.py b/pandas/tests/indexes/multi/test_drop.py
index f7b1bc4729428..c39954b22b0f2 100644
--- a/pandas/tests/indexes/multi/test_drop.py
+++ b/pandas/tests/indexes/multi/test_drop.py
@@ -1,5 +1,3 @@
-import warnings
-
import numpy as np
import pytest
@@ -151,16 +149,6 @@ def test_drop_with_nan_in_index(nulls_fixture):
mi.drop(pd.Timestamp("2001"), level="date")
-def test_drop_with_non_monotonic_duplicates():
- # GH#33494
- mi = MultiIndex.from_tuples([(1, 2), (2, 3), (1, 2)])
- with warnings.catch_warnings():
- warnings.simplefilter("ignore", PerformanceWarning)
- result = mi.drop((1, 2))
- expected = MultiIndex.from_tuples([(2, 3)])
- tm.assert_index_equal(result, expected)
-
-
def test_single_level_drop_partially_missing_elements():
# GH 37820
diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py
index 51538c556de15..4ac9a27069a3f 100644
--- a/pandas/tests/indexes/multi/test_setops.py
+++ b/pandas/tests/indexes/multi/test_setops.py
@@ -2,7 +2,7 @@
import pytest
import pandas as pd
-from pandas import Index, MultiIndex, Series
+from pandas import MultiIndex, Series
import pandas._testing as tm
@@ -294,24 +294,6 @@ def test_intersection(idx, sort):
# assert result.equals(tuples)
-def test_intersection_non_object(idx, sort):
- other = Index(range(3), name="foo")
-
- result = idx.intersection(other, sort=sort)
- expected = MultiIndex(levels=idx.levels, codes=[[]] * idx.nlevels, names=None)
- tm.assert_index_equal(result, expected, exact=True)
-
- # if we pass a length-0 ndarray (i.e. no name, we retain our idx.name)
- result = idx.intersection(np.asarray(other)[:0], sort=sort)
- expected = MultiIndex(levels=idx.levels, codes=[[]] * idx.nlevels, names=idx.names)
- tm.assert_index_equal(result, expected, exact=True)
-
- msg = "other must be a MultiIndex or a list of tuples"
- with pytest.raises(TypeError, match=msg):
- # With non-zero length non-index, we try and fail to convert to tuples
- idx.intersection(np.asarray(other), sort=sort)
-
-
def test_intersect_equal_sort():
# GH-24959
idx = pd.MultiIndex.from_product([[1, 0], ["a", "b"]])
@@ -396,26 +378,3 @@ def test_setops_disallow_true(method):
with pytest.raises(ValueError, match="The 'sort' keyword only takes"):
getattr(idx1, method)(idx2, sort=True)
-
-
-@pytest.mark.parametrize(
- ("tuples", "exp_tuples"),
- [
- ([("val1", "test1")], [("val1", "test1")]),
- ([("val1", "test1"), ("val1", "test1")], [("val1", "test1")]),
- (
- [("val2", "test2"), ("val1", "test1")],
- [("val2", "test2"), ("val1", "test1")],
- ),
- ],
-)
-def test_intersect_with_duplicates(tuples, exp_tuples):
- # GH#36915
- left = MultiIndex.from_tuples(tuples, names=["first", "second"])
- right = MultiIndex.from_tuples(
- [("val1", "test1"), ("val1", "test1"), ("val2", "test2")],
- names=["first", "second"],
- )
- result = left.intersection(right)
- expected = MultiIndex.from_tuples(exp_tuples, names=["first", "second"])
- tm.assert_index_equal(result, expected)
diff --git a/pandas/tests/indexes/period/test_indexing.py b/pandas/tests/indexes/period/test_indexing.py
index c03c89f32f73e..9b203e1b17517 100644
--- a/pandas/tests/indexes/period/test_indexing.py
+++ b/pandas/tests/indexes/period/test_indexing.py
@@ -21,28 +21,6 @@
)
import pandas._testing as tm
-dti4 = date_range("2016-01-01", periods=4)
-dti = dti4[:-1]
-rng = pd.Index(range(3))
-
-
-@pytest.fixture(
- params=[
- dti,
- dti.tz_localize("UTC"),
- dti.to_period("W"),
- dti - dti[0],
- rng,
- pd.Index([1, 2, 3]),
- pd.Index([2.0, 3.0, 4.0]),
- pd.Index([4, 5, 6], dtype="u8"),
- pd.IntervalIndex.from_breaks(dti4),
- ]
-)
-def non_comparable_idx(request):
- # All have length 3
- return request.param
-
class TestGetItem:
def test_ellipsis(self):
@@ -460,37 +438,6 @@ def test_get_indexer_mismatched_dtype(self):
result = pi.get_indexer_non_unique(pi2)[0]
tm.assert_numpy_array_equal(result, expected)
- def test_get_indexer_mismatched_dtype_different_length(self, non_comparable_idx):
- # without method we arent checking inequalities, so get all-missing
- # but do not raise
- dti = date_range("2016-01-01", periods=3)
- pi = dti.to_period("D")
-
- other = non_comparable_idx
-
- res = pi[:-1].get_indexer(other)
- expected = -np.ones(other.shape, dtype=np.intp)
- tm.assert_numpy_array_equal(res, expected)
-
- @pytest.mark.parametrize("method", ["pad", "backfill", "nearest"])
- def test_get_indexer_mismatched_dtype_with_method(self, non_comparable_idx, method):
- dti = date_range("2016-01-01", periods=3)
- pi = dti.to_period("D")
-
- other = non_comparable_idx
-
- msg = re.escape(f"Cannot compare dtypes {pi.dtype} and {other.dtype}")
- with pytest.raises(TypeError, match=msg):
- pi.get_indexer(other, method=method)
-
- for dtype in ["object", "category"]:
- other2 = other.astype(dtype)
- if dtype == "object" and isinstance(other, PeriodIndex):
- continue
- # For object dtype we are liable to get a different exception message
- with pytest.raises(TypeError):
- pi.get_indexer(other2, method=method)
-
def test_get_indexer_non_unique(self):
# GH 17717
p1 = Period("2017-09-02")
diff --git a/pandas/tests/indexes/period/test_partial_slicing.py b/pandas/tests/indexes/period/test_partial_slicing.py
index f354682bf6f70..878a89bd52cb1 100644
--- a/pandas/tests/indexes/period/test_partial_slicing.py
+++ b/pandas/tests/indexes/period/test_partial_slicing.py
@@ -94,7 +94,7 @@ def test_range_slice_outofbounds(self, make_range):
def test_maybe_cast_slice_bound(self, make_range, frame_or_series):
idx = make_range(start="2013/10/01", freq="D", periods=10)
- obj = DataFrame({"units": [100 + i for i in range(10)]}, index=idx)
+ obj = DataFrame(dict(units=[100 + i for i in range(10)]), index=idx)
if frame_or_series is not DataFrame:
obj = obj["units"]
diff --git a/pandas/tests/indexes/ranges/test_setops.py b/pandas/tests/indexes/ranges/test_setops.py
index 660269f2d02a4..1fd41b017221b 100644
--- a/pandas/tests/indexes/ranges/test_setops.py
+++ b/pandas/tests/indexes/ranges/test_setops.py
@@ -3,52 +3,11 @@
import numpy as np
import pytest
-from pandas import Index, Int64Index, RangeIndex, UInt64Index
+from pandas import Index, Int64Index, RangeIndex
import pandas._testing as tm
class TestRangeIndexSetOps:
- @pytest.mark.parametrize("klass", [RangeIndex, Int64Index, UInt64Index])
- def test_intersection_mismatched_dtype(self, klass):
- # check that we cast to float, not object
- index = RangeIndex(start=0, stop=20, step=2, name="foo")
- index = klass(index)
-
- flt = index.astype(np.float64)
-
- # bc index.equals(flt), we go through fastpath and get RangeIndex back
- result = index.intersection(flt)
- tm.assert_index_equal(result, index, exact=True)
-
- result = flt.intersection(index)
- tm.assert_index_equal(result, flt, exact=True)
-
- # neither empty, not-equals
- result = index.intersection(flt[1:])
- tm.assert_index_equal(result, flt[1:], exact=True)
-
- result = flt[1:].intersection(index)
- tm.assert_index_equal(result, flt[1:], exact=True)
-
- # empty other
- result = index.intersection(flt[:0])
- tm.assert_index_equal(result, flt[:0], exact=True)
-
- result = flt[:0].intersection(index)
- tm.assert_index_equal(result, flt[:0], exact=True)
-
- def test_intersection_empty(self, sort, names):
- # name retention on empty intersections
- index = RangeIndex(start=0, stop=20, step=2, name=names[0])
-
- # empty other
- result = index.intersection(index[:0].rename(names[1]), sort=sort)
- tm.assert_index_equal(result, index[:0].rename(names[2]), exact=True)
-
- # empty self
- result = index[:0].intersection(index.rename(names[1]), sort=sort)
- tm.assert_index_equal(result, index[:0].rename(names[2]), exact=True)
-
def test_intersection(self, sort):
# intersect with Int64Index
index = RangeIndex(start=0, stop=20, step=2)
@@ -90,12 +49,12 @@ def test_intersection(self, sort):
result = other.intersection(first, sort=sort).astype(int)
tm.assert_index_equal(result, expected)
- index = RangeIndex(5, name="foo")
+ index = RangeIndex(5)
# intersect of non-overlapping indices
- other = RangeIndex(5, 10, 1, name="foo")
+ other = RangeIndex(5, 10, 1)
result = index.intersection(other, sort=sort)
- expected = RangeIndex(0, 0, 1, name="foo")
+ expected = RangeIndex(0, 0, 1)
tm.assert_index_equal(result, expected)
other = RangeIndex(-1, -5, -1)
@@ -112,12 +71,11 @@ def test_intersection(self, sort):
result = other.intersection(index, sort=sort)
tm.assert_index_equal(result, expected)
- def test_intersection_non_overlapping_gcd(self, sort, names):
# intersection of non-overlapping values based on start value and gcd
- index = RangeIndex(1, 10, 2, name=names[0])
- other = RangeIndex(0, 10, 4, name=names[1])
+ index = RangeIndex(1, 10, 2)
+ other = RangeIndex(0, 10, 4)
result = index.intersection(other, sort=sort)
- expected = RangeIndex(0, 0, 1, name=names[2])
+ expected = RangeIndex(0, 0, 1)
tm.assert_index_equal(result, expected)
def test_union_noncomparable(self, sort):
diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py
index 372a1d290bca0..ba49c51c9db8e 100644
--- a/pandas/tests/indexes/test_base.py
+++ b/pandas/tests/indexes/test_base.py
@@ -9,7 +9,6 @@
import pytest
from pandas._libs.tslib import Timestamp
-from pandas.compat import IS64
from pandas.compat.numpy import np_datetime64_compat
from pandas.util._test_decorators import async_mark
@@ -20,7 +19,6 @@
DatetimeIndex,
Float64Index,
Int64Index,
- IntervalIndex,
PeriodIndex,
RangeIndex,
Series,
@@ -1251,9 +1249,10 @@ def test_get_indexer_numeric_index_boolean_target(self, method, idx_class):
if method == "get_indexer":
tm.assert_numpy_array_equal(result, expected)
else:
- missing = np.arange(3, dtype=np.intp)
+ expected = np.array([-1, -1, -1, -1], dtype=np.intp)
+
tm.assert_numpy_array_equal(result[0], expected)
- tm.assert_numpy_array_equal(result[1], missing)
+ tm.assert_numpy_array_equal(result[1], expected)
def test_get_indexer_with_NA_values(
self, unique_nulls_fixture, unique_nulls_fixture2
@@ -1506,17 +1505,6 @@ def test_drop_tuple(self, values, to_drop):
with pytest.raises(KeyError, match=msg):
removed.drop(drop_me)
- def test_drop_with_duplicates_in_index(self, index):
- # GH38051
- if len(index) == 0 or isinstance(index, MultiIndex):
- return
- if isinstance(index, IntervalIndex) and not IS64:
- pytest.skip("Cannot test IntervalIndex with int64 dtype on 32 bit platform")
- index = index.unique().repeat(2)
- expected = index[2:]
- result = index.drop(index[0])
- tm.assert_index_equal(result, expected)
-
@pytest.mark.parametrize(
"attr",
[
@@ -2358,6 +2346,5 @@ def construct(dtype):
else:
no_matches = np.array([-1] * 6, dtype=np.intp)
- missing = np.arange(6, dtype=np.intp)
tm.assert_numpy_array_equal(result[0], no_matches)
- tm.assert_numpy_array_equal(result[1], missing)
+ tm.assert_numpy_array_equal(result[1], no_matches)
diff --git a/pandas/tests/indexes/test_numpy_compat.py b/pandas/tests/indexes/test_numpy_compat.py
index 8bfb97ca494e6..b71417b2a625d 100644
--- a/pandas/tests/indexes/test_numpy_compat.py
+++ b/pandas/tests/indexes/test_numpy_compat.py
@@ -74,16 +74,14 @@ def test_numpy_ufuncs_basic(index, func):
@pytest.mark.parametrize(
"func", [np.isfinite, np.isinf, np.isnan, np.signbit], ids=lambda x: x.__name__
)
-def test_numpy_ufuncs_other(index, func, request):
+def test_numpy_ufuncs_other(index, func):
# test ufuncs of numpy, see:
# https://numpy.org/doc/stable/reference/ufuncs.html
if isinstance(index, (DatetimeIndex, TimedeltaIndex)):
if isinstance(index, DatetimeIndex) and index.tz is not None:
if func in [np.isfinite, np.isnan, np.isinf]:
- if not np_version_under1p17:
- mark = pytest.mark.xfail(reason="__array_ufunc__ is not defined")
- request.node.add_marker(mark)
+ pytest.xfail(reason="__array_ufunc__ is not defined")
if not np_version_under1p18 and func in [np.isfinite, np.isinf, np.isnan]:
# numpy 1.18(dev) changed isinf and isnan to not raise on dt64/tfd64
diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py
index 6f949960ce30b..0973cef7cfdc1 100644
--- a/pandas/tests/indexes/test_setops.py
+++ b/pandas/tests/indexes/test_setops.py
@@ -98,20 +98,13 @@ def test_compatible_inconsistent_pairs(idx_fact1, idx_fact2):
("Period[D]", "float64", "object"),
],
)
-@pytest.mark.parametrize("names", [("foo", "foo", "foo"), ("foo", "bar", None)])
-def test_union_dtypes(left, right, expected, names):
+def test_union_dtypes(left, right, expected):
left = pandas_dtype(left)
right = pandas_dtype(right)
- a = pd.Index([], dtype=left, name=names[0])
- b = pd.Index([], dtype=right, name=names[1])
- result = a.union(b)
- assert result.dtype == expected
- assert result.name == names[2]
-
- # Testing name retention
- # TODO: pin down desired dtype; do we want it to be commutative?
- result = a.intersection(b)
- assert result.name == names[2]
+ a = pd.Index([], dtype=left)
+ b = pd.Index([], dtype=right)
+ result = a.union(b).dtype
+ assert result == expected
def test_dunder_inplace_setops_deprecated(index):
@@ -127,16 +120,6 @@ def test_dunder_inplace_setops_deprecated(index):
index ^= index
-@pytest.mark.parametrize("values", [[1, 2, 2, 3], [3, 3]])
-def test_intersection_duplicates(values):
- # GH#31326
- a = pd.Index(values)
- b = pd.Index([3, 3])
- result = a.intersection(b)
- expected = pd.Index([3])
- tm.assert_index_equal(result, expected)
-
-
class TestSetOps:
# Set operation tests shared by all indexes in the `index` fixture
@pytest.mark.parametrize("case", [0.5, "xxx"])
@@ -395,25 +378,6 @@ def test_intersect_unequal(self, index, fname, sname, expected_name):
expected = index[1:].set_names(expected_name).sort_values()
tm.assert_index_equal(intersect, expected)
- def test_intersection_name_retention_with_nameless(self, index):
- if isinstance(index, MultiIndex):
- index = index.rename(list(range(index.nlevels)))
- else:
- index = index.rename("foo")
-
- other = np.asarray(index)
-
- result = index.intersection(other)
- assert result.name == index.name
-
- # empty other, same dtype
- result = index.intersection(other[:0])
- assert result.name == index.name
-
- # empty `self`
- result = index[:0].intersection(other)
- assert result.name == index.name
-
def test_difference_preserves_type_empty(self, index, sort):
# GH#20040
# If taking difference of a set and itself, it
@@ -424,18 +388,6 @@ def test_difference_preserves_type_empty(self, index, sort):
expected = index[:0]
tm.assert_index_equal(result, expected, exact=True)
- def test_difference_name_retention_equals(self, index, sort, names):
- if isinstance(index, MultiIndex):
- names = [[x] * index.nlevels for x in names]
- index = index.rename(names[0])
- other = index.rename(names[1])
-
- assert index.equals(other)
-
- result = index.difference(other)
- expected = index[:0].rename(names[2])
- tm.assert_index_equal(result, expected)
-
def test_intersection_difference_match_empty(self, index, sort):
# GH#20040
# Test that the intersection of an index with an
diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py
index f0e730eecf3d5..774370ed866da 100644
--- a/pandas/tests/indexes/timedeltas/test_timedelta.py
+++ b/pandas/tests/indexes/timedeltas/test_timedelta.py
@@ -75,26 +75,17 @@ def test_factorize(self):
arr, idx = idx1.factorize()
tm.assert_numpy_array_equal(arr, exp_arr)
tm.assert_index_equal(idx, exp_idx)
- assert idx.freq == exp_idx.freq
arr, idx = idx1.factorize(sort=True)
tm.assert_numpy_array_equal(arr, exp_arr)
tm.assert_index_equal(idx, exp_idx)
- assert idx.freq == exp_idx.freq
- def test_factorize_preserves_freq(self):
- # GH#38120 freq should be preserved
+ # freq must be preserved
idx3 = timedelta_range("1 day", periods=4, freq="s")
exp_arr = np.array([0, 1, 2, 3], dtype=np.intp)
arr, idx = idx3.factorize()
tm.assert_numpy_array_equal(arr, exp_arr)
tm.assert_index_equal(idx, idx3)
- assert idx.freq == idx3.freq
-
- arr, idx = pd.factorize(idx3)
- tm.assert_numpy_array_equal(arr, exp_arr)
- tm.assert_index_equal(idx, idx3)
- assert idx.freq == idx3.freq
def test_sort_values(self):
diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py
index 9a3039c28416c..a3b8d66c92024 100644
--- a/pandas/tests/indexing/multiindex/test_multiindex.py
+++ b/pandas/tests/indexing/multiindex/test_multiindex.py
@@ -83,13 +83,3 @@ def test_nested_tuples_duplicates(self):
df3 = df.copy(deep=True)
df3.loc[[(dti[0], "a")], "c2"] = 1.0
tm.assert_frame_equal(df3, expected)
-
- def test_multiindex_with_datatime_level_preserves_freq(self):
- # https://github.com/pandas-dev/pandas/issues/35563
- idx = Index(range(2), name="A")
- dti = pd.date_range("2020-01-01", periods=7, freq="D", name="B")
- mi = MultiIndex.from_product([idx, dti])
- df = DataFrame(np.random.randn(14, 2), index=mi)
- result = df.loc[0].index
- tm.assert_index_equal(result, dti)
- assert result.freq == dti.freq
diff --git a/pandas/tests/indexing/multiindex/test_slice.py b/pandas/tests/indexing/multiindex/test_slice.py
index 51684f092aefd..d58bc4713f99f 100644
--- a/pandas/tests/indexing/multiindex/test_slice.py
+++ b/pandas/tests/indexing/multiindex/test_slice.py
@@ -779,13 +779,3 @@ def test_non_reducing_slice_on_multiindex(self):
result = df.loc[tslice_]
expected = DataFrame({("b", "d"): [4, 1]})
tm.assert_frame_equal(result, expected)
-
- def test_loc_slice_negative_stepsize(self):
- # GH#38071
- mi = MultiIndex.from_product([["a", "b"], [0, 1]])
- df = DataFrame([[1, 2], [3, 4], [5, 6], [7, 8]], index=mi)
- result = df.loc[("a", slice(None, None, -1)), :]
- expected = DataFrame(
- [[3, 4], [1, 2]], index=MultiIndex.from_tuples([("a", 1), ("a", 0)])
- )
- tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py
index 1b9b6452b2e33..6fff706e27cd2 100644
--- a/pandas/tests/indexing/test_categorical.py
+++ b/pandas/tests/indexing/test_categorical.py
@@ -57,12 +57,9 @@ def test_loc_scalar(self):
with pytest.raises(KeyError, match=r"^'d'$"):
df.loc["d"]
- df2 = df.copy()
- expected = df2.copy()
- expected.index = expected.index.astype(object)
- expected.loc["d"] = 10
- df2.loc["d"] = 10
- tm.assert_frame_equal(df2, expected)
+ msg = "cannot append a non-category item to a CategoricalIndex"
+ with pytest.raises(TypeError, match=msg):
+ df.loc["d"] = 10
msg = "'fill_value=d' is not present in this Categorical's categories"
with pytest.raises(TypeError, match=msg):
diff --git a/pandas/tests/indexing/test_indexers.py b/pandas/tests/indexing/test_indexers.py
index 14b2b494d65fb..744f9441e7376 100644
--- a/pandas/tests/indexing/test_indexers.py
+++ b/pandas/tests/indexing/test_indexers.py
@@ -28,12 +28,6 @@ def test_is_scalar_indexer():
assert not is_scalar_indexer(slice(None), 1)
- indexer = 0
- assert is_scalar_indexer(indexer, 1)
-
- indexer = (0,)
- assert is_scalar_indexer(indexer, 1)
-
class TestValidateIndices:
def test_validate_indices_ok(self):
diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py
index 68f12a939e061..cf6c2878acd9a 100644
--- a/pandas/tests/indexing/test_loc.py
+++ b/pandas/tests/indexing/test_loc.py
@@ -1782,23 +1782,21 @@ def test_series_getitem_label_list_missing_integer_values():
@pytest.mark.parametrize(
- "columns, column_key, expected_columns",
+ "columns, column_key, expected_columns, check_column_type",
[
- ([2011, 2012, 2013], [2011, 2012], [0, 1]),
- ([2011, 2012, "All"], [2011, 2012], [0, 1]),
- ([2011, 2012, "All"], [2011, "All"], [0, 2]),
+ ([2011, 2012, 2013], [2011, 2012], [0, 1], True),
+ ([2011, 2012, "All"], [2011, 2012], [0, 1], False),
+ ([2011, 2012, "All"], [2011, "All"], [0, 2], True),
],
)
-def test_loc_getitem_label_list_integer_labels(columns, column_key, expected_columns):
+def test_loc_getitem_label_list_integer_labels(
+ columns, column_key, expected_columns, check_column_type
+):
# gh-14836
df = DataFrame(np.random.rand(3, 3), columns=columns, index=list("ABC"))
expected = df.iloc[:, expected_columns]
result = df.loc[["A", "B", "C"], column_key]
-
- if df.columns.is_object() and all(isinstance(x, int) for x in column_key):
- expected.columns = expected.columns.astype(int)
-
- tm.assert_frame_equal(result, expected, check_column_type=True)
+ tm.assert_frame_equal(result, expected, check_column_type=check_column_type)
def test_loc_setitem_float_intindex():
@@ -2072,21 +2070,3 @@ def test_loc_setitem_dt64tz_values(self):
s2["a"] = expected
result = s2["a"]
assert result == expected
-
- @pytest.mark.parametrize("array_fn", [np.array, pd.array, list, tuple])
- @pytest.mark.parametrize("size", [0, 4, 5, 6])
- def test_loc_iloc_setitem_with_listlike(self, size, array_fn):
- # GH37748
- # testing insertion, in a Series of size N (here 5), of a listlike object
- # of size 0, N-1, N, N+1
-
- arr = array_fn([0] * size)
- expected = Series([arr, 0, 0, 0, 0], index=list("abcde"), dtype=object)
-
- ser = Series(0, index=list("abcde"), dtype=object)
- ser.loc["a"] = arr
- tm.assert_series_equal(ser, expected)
-
- ser = Series(0, index=list("abcde"), dtype=object)
- ser.iloc[0] = arr
- tm.assert_series_equal(ser, expected)
diff --git a/pandas/tests/indexing/test_scalar.py b/pandas/tests/indexing/test_scalar.py
index ce48fd1e5c905..dd01f4e6a4f49 100644
--- a/pandas/tests/indexing/test_scalar.py
+++ b/pandas/tests/indexing/test_scalar.py
@@ -268,41 +268,35 @@ def test_at_with_tuple_index_set():
assert series.at[1, 2] == 3
-class TestMultiIndexScalar:
- def test_multiindex_at_get(self):
- # GH 26989
- # DataFrame.at and DataFrame.loc getter works with MultiIndex
- df = DataFrame({"a": [1, 2]}, index=[[1, 2], [3, 4]])
- assert df.index.nlevels == 2
- assert df.at[(1, 3), "a"] == 1
- assert df.loc[(1, 3), "a"] == 1
-
- # Series.at and Series.loc getter works with MultiIndex
- series = df["a"]
- assert series.index.nlevels == 2
- assert series.at[1, 3] == 1
- assert series.loc[1, 3] == 1
-
- def test_multiindex_at_set(self):
- # GH 26989
- # DataFrame.at and DataFrame.loc setter works with MultiIndex
- df = DataFrame({"a": [1, 2]}, index=[[1, 2], [3, 4]])
- assert df.index.nlevels == 2
- df.at[(1, 3), "a"] = 3
- assert df.at[(1, 3), "a"] == 3
- df.loc[(1, 3), "a"] = 4
- assert df.loc[(1, 3), "a"] == 4
-
- # Series.at and Series.loc setter works with MultiIndex
- series = df["a"]
- assert series.index.nlevels == 2
- series.at[1, 3] = 5
- assert series.at[1, 3] == 5
- series.loc[1, 3] = 6
- assert series.loc[1, 3] == 6
-
- def test_multiindex_at_get_one_level(self):
- # GH#38053
- s2 = Series((0, 1), index=[[False, True]])
- result = s2.at[False]
- assert result == 0
+def test_multiindex_at_get():
+ # GH 26989
+ # DataFrame.at and DataFrame.loc getter works with MultiIndex
+ df = DataFrame({"a": [1, 2]}, index=[[1, 2], [3, 4]])
+ assert df.index.nlevels == 2
+ assert df.at[(1, 3), "a"] == 1
+ assert df.loc[(1, 3), "a"] == 1
+
+ # Series.at and Series.loc getter works with MultiIndex
+ series = df["a"]
+ assert series.index.nlevels == 2
+ assert series.at[1, 3] == 1
+ assert series.loc[1, 3] == 1
+
+
+def test_multiindex_at_set():
+ # GH 26989
+ # DataFrame.at and DataFrame.loc setter works with MultiIndex
+ df = DataFrame({"a": [1, 2]}, index=[[1, 2], [3, 4]])
+ assert df.index.nlevels == 2
+ df.at[(1, 3), "a"] = 3
+ assert df.at[(1, 3), "a"] == 3
+ df.loc[(1, 3), "a"] = 4
+ assert df.loc[(1, 3), "a"] == 4
+
+ # Series.at and Series.loc setter works with MultiIndex
+ series = df["a"]
+ assert series.index.nlevels == 2
+ series.at[1, 3] = 5
+ assert series.at[1, 3] == 5
+ series.loc[1, 3] = 6
+ assert series.loc[1, 3] == 6
diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py
index bcc666a88e3be..e9f228b5973b5 100644
--- a/pandas/tests/io/conftest.py
+++ b/pandas/tests/io/conftest.py
@@ -1,4 +1,3 @@
-import logging
import os
import shlex
import subprocess
@@ -50,8 +49,6 @@ def s3_base(worker_id):
pytest.importorskip("s3fs")
pytest.importorskip("boto3")
requests = pytest.importorskip("requests")
- # GH 38090: Suppress http logs in tests by moto_server
- logging.getLogger("werkzeug").disabled = True
with tm.ensure_safe_environment_variables():
# temporary workaround as moto fails for botocore >= 1.11 otherwise,
diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py
index 3155e22d3ff5d..1349808277d81 100644
--- a/pandas/tests/io/excel/test_openpyxl.py
+++ b/pandas/tests/io/excel/test_openpyxl.py
@@ -68,11 +68,11 @@ def test_write_cells_merge_styled(ext):
]
with tm.ensure_clean(ext) as path:
- with _OpenpyxlWriter(path) as writer:
- writer.write_cells(initial_cells, sheet_name=sheet_name)
- writer.write_cells(merge_cells, sheet_name=sheet_name)
+ writer = _OpenpyxlWriter(path)
+ writer.write_cells(initial_cells, sheet_name=sheet_name)
+ writer.write_cells(merge_cells, sheet_name=sheet_name)
- wks = writer.sheets[sheet_name]
+ wks = writer.sheets[sheet_name]
xcell_b1 = wks["B1"]
xcell_a2 = wks["A2"]
assert xcell_b1.font == openpyxl_sty_merged
@@ -93,8 +93,9 @@ def test_write_append_mode(ext, mode, expected):
wb.worksheets[1]["A1"].value = "bar"
wb.save(f)
- with ExcelWriter(f, engine="openpyxl", mode=mode) as writer:
- df.to_excel(writer, sheet_name="baz", index=False)
+ writer = ExcelWriter(f, engine="openpyxl", mode=mode)
+ df.to_excel(writer, sheet_name="baz", index=False)
+ writer.save()
wb2 = openpyxl.load_workbook(f)
result = [sheet.title for sheet in wb2.worksheets]
diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
index 98a55ae39bd77..c582a0fa23577 100644
--- a/pandas/tests/io/excel/test_readers.py
+++ b/pandas/tests/io/excel/test_readers.py
@@ -577,10 +577,6 @@ def test_date_conversion_overflow(self, read_ext):
if pd.read_excel.keywords["engine"] == "openpyxl":
pytest.xfail("Maybe not supported by openpyxl")
- if pd.read_excel.keywords["engine"] is None:
- # GH 35029
- pytest.xfail("Defaults to openpyxl, maybe not supported")
-
result = pd.read_excel("testdateoverflow" + read_ext)
tm.assert_frame_equal(result, expected)
@@ -1163,7 +1159,7 @@ def test_excel_high_surrogate(self, engine):
expected = DataFrame(["\udc88"], columns=["Column1"])
# should not produce a segmentation violation
- actual = pd.read_excel("high_surrogate.xlsx", engine="xlrd")
+ actual = pd.read_excel("high_surrogate.xlsx")
tm.assert_frame_equal(expected, actual)
@pytest.mark.parametrize("filename", ["df_empty.xlsx", "df_equals.xlsx"])
diff --git a/pandas/tests/io/excel/test_style.py b/pandas/tests/io/excel/test_style.py
index 6b1abebe0506a..936fc175a493b 100644
--- a/pandas/tests/io/excel/test_style.py
+++ b/pandas/tests/io/excel/test_style.py
@@ -68,14 +68,15 @@ def custom_converter(css):
df = DataFrame(np.random.randn(11, 3))
with tm.ensure_clean(".xlsx" if engine != "xlwt" else ".xls") as path:
- with ExcelWriter(path, engine=engine) as writer:
- df.to_excel(writer, sheet_name="frame")
- df.style.to_excel(writer, sheet_name="unstyled")
- styled = df.style.apply(style, axis=None)
- styled.to_excel(writer, sheet_name="styled")
- ExcelFormatter(styled, style_converter=custom_converter).write(
- writer, sheet_name="custom"
- )
+ writer = ExcelWriter(path, engine=engine)
+ df.to_excel(writer, sheet_name="frame")
+ df.style.to_excel(writer, sheet_name="unstyled")
+ styled = df.style.apply(style, axis=None)
+ styled.to_excel(writer, sheet_name="styled")
+ ExcelFormatter(styled, style_converter=custom_converter).write(
+ writer, sheet_name="custom"
+ )
+ writer.save()
if engine not in ("openpyxl", "xlsxwriter"):
# For other engines, we only smoke test
diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py
index 80ebeb4c03d89..8da9c79160e91 100644
--- a/pandas/tests/io/excel/test_writers.py
+++ b/pandas/tests/io/excel/test_writers.py
@@ -351,15 +351,12 @@ def test_excel_sheet_by_name_raise(self, path, engine):
msg = "sheet 0 not found"
with pytest.raises(ValueError, match=msg):
pd.read_excel(xl, "0")
- elif engine == "xlwt":
+ else:
import xlrd
msg = "No sheet named <'0'>"
with pytest.raises(xlrd.XLRDError, match=msg):
pd.read_excel(xl, sheet_name="0")
- else:
- with pytest.raises(KeyError, match="Worksheet 0 does not exist."):
- pd.read_excel(xl, sheet_name="0")
def test_excel_writer_context_manager(self, frame, path):
with ExcelWriter(path) as writer:
@@ -472,12 +469,12 @@ def test_int_types(self, np_type, path):
# Test with convert_float=False comes back as float.
float_frame = df.astype(float)
- float_frame.columns = float_frame.columns.astype(float)
- float_frame.index = float_frame.index.astype(float)
recons = pd.read_excel(
path, sheet_name="test1", convert_float=False, index_col=0
)
- tm.assert_frame_equal(recons, float_frame)
+ tm.assert_frame_equal(
+ recons, float_frame, check_index_type=False, check_column_type=False
+ )
@pytest.mark.parametrize("np_type", [np.float16, np.float32, np.float64])
def test_float_types(self, np_type, path):
@@ -525,9 +522,10 @@ def test_sheets(self, frame, tsframe, path):
frame.to_excel(path, "test1", index=False)
# Test writing to separate sheets
- with ExcelWriter(path) as writer:
- frame.to_excel(writer, "test1")
- tsframe.to_excel(writer, "test2")
+ writer = ExcelWriter(path)
+ frame.to_excel(writer, "test1")
+ tsframe.to_excel(writer, "test2")
+ writer.close()
reader = ExcelFile(path)
recons = pd.read_excel(reader, sheet_name="test1", index_col=0)
tm.assert_frame_equal(frame, recons)
@@ -1195,24 +1193,23 @@ def test_datetimes(self, path):
write_frame = DataFrame({"A": datetimes})
write_frame.to_excel(path, "Sheet1")
- # GH 35029 - Default changed to openpyxl, but test is for odf/xlrd
- engine = "odf" if path.endswith("ods") else "xlrd"
- read_frame = pd.read_excel(path, sheet_name="Sheet1", header=0, engine=engine)
+ read_frame = pd.read_excel(path, sheet_name="Sheet1", header=0)
tm.assert_series_equal(write_frame["A"], read_frame["A"])
def test_bytes_io(self, engine):
# see gh-7074
- with BytesIO() as bio:
- df = DataFrame(np.random.randn(10, 2))
+ bio = BytesIO()
+ df = DataFrame(np.random.randn(10, 2))
- # Pass engine explicitly, as there is no file path to infer from.
- with ExcelWriter(bio, engine=engine) as writer:
- df.to_excel(writer)
+ # Pass engine explicitly, as there is no file path to infer from.
+ writer = ExcelWriter(bio, engine=engine)
+ df.to_excel(writer)
+ writer.save()
- bio.seek(0)
- reread_df = pd.read_excel(bio, index_col=0)
- tm.assert_frame_equal(df, reread_df)
+ bio.seek(0)
+ reread_df = pd.read_excel(bio, index_col=0)
+ tm.assert_frame_equal(df, reread_df)
def test_write_lists_dict(self, path):
# see gh-8188.
@@ -1320,12 +1317,12 @@ class TestExcelWriterEngineTests:
)
def test_ExcelWriter_dispatch(self, klass, ext):
with tm.ensure_clean(ext) as path:
- with ExcelWriter(path) as writer:
- if ext == ".xlsx" and td.safe_import("xlsxwriter"):
- # xlsxwriter has preference over openpyxl if both installed
- assert isinstance(writer, _XlsxWriter)
- else:
- assert isinstance(writer, klass)
+ writer = ExcelWriter(path)
+ if ext == ".xlsx" and td.safe_import("xlsxwriter"):
+ # xlsxwriter has preference over openpyxl if both installed
+ assert isinstance(writer, _XlsxWriter)
+ else:
+ assert isinstance(writer, klass)
def test_ExcelWriter_dispatch_raises(self):
with pytest.raises(ValueError, match="No engine"):
@@ -1359,8 +1356,8 @@ def check_called(func):
path = "something.xlsx"
with tm.ensure_clean(path) as filepath:
register_writer(DummyClass)
- with ExcelWriter(filepath) as writer:
- assert isinstance(writer, DummyClass)
+ writer = ExcelWriter(filepath)
+ assert isinstance(writer, DummyClass)
df = tm.makeCustomDataframe(1, 1)
check_called(lambda: df.to_excel(filepath))
with tm.ensure_clean("something.xls") as filepath:
@@ -1380,5 +1377,5 @@ def test_excelfile_fspath(self):
def test_excelwriter_fspath(self):
with tm.ensure_clean("foo.xlsx") as path:
- with ExcelWriter(path) as writer:
- assert os.fspath(writer) == str(path)
+ writer = ExcelWriter(path)
+ assert os.fspath(writer) == str(path)
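
For reference, a minimal sketch of the explicit create/save ExcelWriter pattern these hunks switch to in place of the context-manager form; the file name below is only a placeholder.

import pandas as pd
from pandas import ExcelWriter

df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
writer = ExcelWriter("example.xlsx")  # placeholder path
df.to_excel(writer, sheet_name="test1")
writer.save()  # must be called explicitly when no with-block is used
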
diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py
index f2fbcbc2e2f04..26190edaa4960 100644
--- a/pandas/tests/io/excel/test_xlrd.py
+++ b/pandas/tests/io/excel/test_xlrd.py
@@ -1,7 +1,5 @@
import pytest
-from pandas.compat._optional import import_optional_dependency
-
import pandas as pd
import pandas._testing as tm
@@ -40,48 +38,6 @@ def test_read_xlrd_book(read_ext, frame):
# TODO: test for openpyxl as well
def test_excel_table_sheet_by_index(datapath, read_ext):
path = datapath("io", "data", "excel", f"test1{read_ext}")
- with ExcelFile(path, engine="xlrd") as excel:
+ with ExcelFile(path) as excel:
with pytest.raises(xlrd.XLRDError):
pd.read_excel(excel, sheet_name="asdf")
-
-
-def test_excel_file_warning_with_xlsx_file(datapath):
- # GH 29375
- path = datapath("io", "data", "excel", "test1.xlsx")
- has_openpyxl = (
- import_optional_dependency(
- "openpyxl", raise_on_missing=False, on_version="ignore"
- )
- is not None
- )
- if not has_openpyxl:
- with tm.assert_produces_warning(
- FutureWarning,
- raise_on_extra_warnings=False,
- match="The xlrd engine is no longer maintained",
- ):
- ExcelFile(path, engine=None)
- else:
- with tm.assert_produces_warning(None):
- pd.read_excel(path, "Sheet1", engine=None)
-
-
-def test_read_excel_warning_with_xlsx_file(tmpdir, datapath):
- # GH 29375
- path = datapath("io", "data", "excel", "test1.xlsx")
- has_openpyxl = (
- import_optional_dependency(
- "openpyxl", raise_on_missing=False, on_version="ignore"
- )
- is not None
- )
- if not has_openpyxl:
- with tm.assert_produces_warning(
- FutureWarning,
- raise_on_extra_warnings=False,
- match="The xlrd engine is no longer maintained",
- ):
- pd.read_excel(path, "Sheet1", engine=None)
- else:
- with tm.assert_produces_warning(None):
- pd.read_excel(path, "Sheet1", engine=None)
diff --git a/pandas/tests/io/excel/test_xlsxwriter.py b/pandas/tests/io/excel/test_xlsxwriter.py
index 6de378f6a3d3e..b6f791434a92b 100644
--- a/pandas/tests/io/excel/test_xlsxwriter.py
+++ b/pandas/tests/io/excel/test_xlsxwriter.py
@@ -23,15 +23,16 @@ def test_column_format(ext):
with tm.ensure_clean(ext) as path:
frame = DataFrame({"A": [123456, 123456], "B": [123456, 123456]})
- with ExcelWriter(path) as writer:
- frame.to_excel(writer)
-
- # Add a number format to col B and ensure it is applied to cells.
- num_format = "#,##0"
- write_workbook = writer.book
- write_worksheet = write_workbook.worksheets()[0]
- col_format = write_workbook.add_format({"num_format": num_format})
- write_worksheet.set_column("B:B", None, col_format)
+ writer = ExcelWriter(path)
+ frame.to_excel(writer)
+
+ # Add a number format to col B and ensure it is applied to cells.
+ num_format = "#,##0"
+ write_workbook = writer.book
+ write_worksheet = write_workbook.worksheets()[0]
+ col_format = write_workbook.add_format({"num_format": num_format})
+ write_worksheet.set_column("B:B", None, col_format)
+ writer.save()
read_workbook = openpyxl.load_workbook(path)
try:
diff --git a/pandas/tests/io/formats/test_to_excel.py b/pandas/tests/io/formats/test_to_excel.py
index 4f1af132204bb..8529a0fb33b67 100644
--- a/pandas/tests/io/formats/test_to_excel.py
+++ b/pandas/tests/io/formats/test_to_excel.py
@@ -278,7 +278,7 @@ def test_css_to_excel_good_colors(input_color, output_color):
f"color: {input_color}"
)
- expected = {}
+ expected = dict()
expected["fill"] = {"patternType": "solid", "fgColor": output_color}
@@ -305,7 +305,7 @@ def test_css_to_excel_bad_colors(input_color):
f"color: {input_color}"
)
- expected = {}
+ expected = dict()
if input_color is not None:
expected["fill"] = {"patternType": "solid"}
diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py
index a88dec84bd693..aaadc965aca52 100644
--- a/pandas/tests/io/formats/test_to_html.py
+++ b/pandas/tests/io/formats/test_to_html.py
@@ -152,8 +152,8 @@ def test_to_html_decimal(datapath):
@pytest.mark.parametrize(
"kwargs,string,expected",
[
- ({}, "", "escaped"),
- ({"escape": False}, "bold", "escape_disabled"),
+ (dict(), "", "escaped"),
+ (dict(escape=False), "bold", "escape_disabled"),
],
)
def test_to_html_escaped(kwargs, string, expected, datapath):
diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py
index ba6d7c010613b..81e8e0bd2b526 100644
--- a/pandas/tests/io/formats/test_to_latex.py
+++ b/pandas/tests/io/formats/test_to_latex.py
@@ -92,7 +92,7 @@ def test_to_latex_tabular_without_index(self):
@pytest.mark.parametrize(
"bad_column_format",
- [5, 1.2, ["l", "r"], ("r", "c"), {"r", "c", "l"}, {"a": "r", "b": "l"}],
+ [5, 1.2, ["l", "r"], ("r", "c"), {"r", "c", "l"}, dict(a="r", b="l")],
)
def test_to_latex_bad_column_format(self, bad_column_format):
df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]})
diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py
index 5faca6bd89dad..a41af9886c617 100644
--- a/pandas/tests/io/json/test_compression.py
+++ b/pandas/tests/io/json/test_compression.py
@@ -65,10 +65,8 @@ def test_chunksize_with_compression(compression):
df = pd.read_json('{"a": ["foo", "bar", "baz"], "b": [4, 5, 6]}')
df.to_json(path, orient="records", lines=True, compression=compression)
- with pd.read_json(
- path, lines=True, chunksize=1, compression=compression
- ) as res:
- roundtripped_df = pd.concat(res)
+ res = pd.read_json(path, lines=True, chunksize=1, compression=compression)
+ roundtripped_df = pd.concat(res)
tm.assert_frame_equal(df, roundtripped_df)
diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py
index 244302e34337d..8d93fbcc063f4 100644
--- a/pandas/tests/io/json/test_normalize.py
+++ b/pandas/tests/io/json/test_normalize.py
@@ -521,13 +521,13 @@ def test_meta_non_iterable(self):
class TestNestedToRecord:
def test_flat_stays_flat(self):
- recs = [{"flat1": 1, "flat2": 2}, {"flat3": 3, "flat2": 4}]
+ recs = [dict(flat1=1, flat2=2), dict(flat1=3, flat2=4)]
result = nested_to_record(recs)
expected = recs
assert result == expected
def test_one_level_deep_flattens(self):
- data = {"flat1": 1, "dict1": {"c": 1, "d": 2}}
+ data = dict(flat1=1, dict1=dict(c=1, d=2))
result = nested_to_record(data)
expected = {"dict1.c": 1, "dict1.d": 2, "flat1": 1}
@@ -535,11 +535,7 @@ def test_one_level_deep_flattens(self):
assert result == expected
def test_nested_flattens(self):
- data = {
- "flat1": 1,
- "dict1": {"c": 1, "d": 2},
- "nested": {"e": {"c": 1, "d": 2}, "d": 2},
- }
+ data = dict(flat1=1, dict1=dict(c=1, d=2), nested=dict(e=dict(c=1, d=2), d=2))
result = nested_to_record(data)
expected = {
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
index ce95eb59ed3c4..fdf2caa804def 100644
--- a/pandas/tests/io/json/test_pandas.py
+++ b/pandas/tests/io/json/test_pandas.py
@@ -379,7 +379,7 @@ def test_frame_infinity(self, orient, inf, dtype):
],
)
def test_frame_to_json_float_precision(self, value, precision, expected_val):
- df = DataFrame([{"a_float": value}])
+ df = DataFrame([dict(a_float=value)])
encoded = df.to_json(double_precision=precision)
assert encoded == f'{{"a_float":{{"0":{expected_val}}}}}'
@@ -475,8 +475,8 @@ def test_blocks_compat_GH9037(self):
index = DatetimeIndex(list(index), freq=None)
df_mixed = DataFrame(
- {
- "float_1": [
+ dict(
+ float_1=[
-0.92077639,
0.77434435,
1.25234727,
@@ -488,7 +488,7 @@ def test_blocks_compat_GH9037(self):
0.95748401,
-1.02970536,
],
- "int_1": [
+ int_1=[
19680418,
75337055,
99973684,
@@ -500,7 +500,7 @@ def test_blocks_compat_GH9037(self):
41903419,
16008365,
],
- "str_1": [
+ str_1=[
"78c608f1",
"64a99743",
"13d2ff52",
@@ -512,7 +512,7 @@ def test_blocks_compat_GH9037(self):
"7a669144",
"8d64d068",
],
- "float_2": [
+ float_2=[
-0.0428278,
-1.80872357,
3.36042349,
@@ -524,7 +524,7 @@ def test_blocks_compat_GH9037(self):
-0.03030452,
1.43366348,
],
- "str_2": [
+ str_2=[
"14f04af9",
"d085da90",
"4bcfac83",
@@ -536,7 +536,7 @@ def test_blocks_compat_GH9037(self):
"1f6a09ba",
"4bfc4d87",
],
- "int_2": [
+ int_2=[
86967717,
98098830,
51927505,
@@ -548,7 +548,7 @@ def test_blocks_compat_GH9037(self):
24867120,
76131025,
],
- },
+ ),
index=index,
)
@@ -727,7 +727,9 @@ def test_series_with_dtype_datetime(self, dtype, expected):
def test_frame_from_json_precise_float(self):
df = DataFrame([[4.56, 4.56, 4.56], [4.56, 4.56, 4.56]])
result = read_json(df.to_json(), precise_float=True)
- tm.assert_frame_equal(result, df)
+ tm.assert_frame_equal(
+ result, df, check_index_type=False, check_column_type=False
+ )
def test_typ(self):
diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py
index 4bbd81ada995b..2e68d3306c7d1 100644
--- a/pandas/tests/io/json/test_readlines.py
+++ b/pandas/tests/io/json/test_readlines.py
@@ -77,8 +77,8 @@ def test_readjson_chunks(lines_json_df, chunksize):
# GH17048: memory usage when lines=True
unchunked = read_json(StringIO(lines_json_df), lines=True)
- with read_json(StringIO(lines_json_df), lines=True, chunksize=chunksize) as reader:
- chunked = pd.concat(reader)
+ reader = read_json(StringIO(lines_json_df), lines=True, chunksize=chunksize)
+ chunked = pd.concat(reader)
tm.assert_frame_equal(chunked, unchunked)
@@ -86,8 +86,7 @@ def test_readjson_chunks(lines_json_df, chunksize):
def test_readjson_chunksize_requires_lines(lines_json_df):
msg = "chunksize can only be passed if lines=True"
with pytest.raises(ValueError, match=msg):
- with pd.read_json(StringIO(lines_json_df), lines=False, chunksize=2) as _:
- pass
+ pd.read_json(StringIO(lines_json_df), lines=False, chunksize=2)
def test_readjson_chunks_series():
@@ -98,8 +97,7 @@ def test_readjson_chunks_series():
unchunked = pd.read_json(strio, lines=True, typ="Series")
strio = StringIO(s.to_json(lines=True, orient="records"))
- with pd.read_json(strio, lines=True, typ="Series", chunksize=1) as reader:
- chunked = pd.concat(reader)
+ chunked = pd.concat(pd.read_json(strio, lines=True, typ="Series", chunksize=1))
tm.assert_series_equal(chunked, unchunked)
@@ -107,8 +105,7 @@ def test_readjson_chunks_series():
def test_readjson_each_chunk(lines_json_df):
# Other tests check that the final result of read_json(chunksize=True)
# is correct. This checks the intermediate chunks.
- with pd.read_json(StringIO(lines_json_df), lines=True, chunksize=2) as reader:
- chunks = list(reader)
+ chunks = list(pd.read_json(StringIO(lines_json_df), lines=True, chunksize=2))
assert chunks[0].shape == (2, 2)
assert chunks[1].shape == (1, 2)
@@ -117,8 +114,7 @@ def test_readjson_chunks_from_file():
with tm.ensure_clean("test.json") as path:
df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
df.to_json(path, lines=True, orient="records")
- with pd.read_json(path, lines=True, chunksize=1) as reader:
- chunked = pd.concat(reader)
+ chunked = pd.concat(pd.read_json(path, lines=True, chunksize=1))
unchunked = pd.read_json(path, lines=True)
tm.assert_frame_equal(unchunked, chunked)
@@ -145,8 +141,7 @@ def test_readjson_chunks_closes(chunksize):
compression=None,
nrows=None,
)
- with reader:
- reader.read()
+ reader.read()
assert (
reader.handles.handle.closed
), f"didn't close stream with chunksize = {chunksize}"
@@ -157,10 +152,7 @@ def test_readjson_invalid_chunksize(lines_json_df, chunksize):
msg = r"'chunksize' must be an integer >=1"
with pytest.raises(ValueError, match=msg):
- with pd.read_json(
- StringIO(lines_json_df), lines=True, chunksize=chunksize
- ) as _:
- pass
+ pd.read_json(StringIO(lines_json_df), lines=True, chunksize=chunksize)
@pytest.mark.parametrize("chunksize", [None, 1, 2])
@@ -184,8 +176,7 @@ def test_readjson_chunks_multiple_empty_lines(chunksize):
orig = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
test = pd.read_json(j, lines=True, chunksize=chunksize)
if chunksize is not None:
- with test:
- test = pd.concat(test)
+ test = pd.concat(test)
tm.assert_frame_equal(orig, test, obj=f"chunksize: {chunksize}")
@@ -221,8 +212,8 @@ def test_readjson_nrows_chunks(nrows, chunksize):
{"a": 3, "b": 4}
{"a": 5, "b": 6}
{"a": 7, "b": 8}"""
- with read_json(jsonl, lines=True, nrows=nrows, chunksize=chunksize) as reader:
- chunked = pd.concat(reader)
+ reader = read_json(jsonl, lines=True, nrows=nrows, chunksize=chunksize)
+ chunked = pd.concat(reader)
expected = DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows]
tm.assert_frame_equal(chunked, expected)
@@ -249,6 +240,6 @@ def test_readjson_lines_chunks_fileurl(datapath):
]
os_path = datapath("io", "json", "data", "line_delimited.json")
file_url = Path(os_path).as_uri()
- with pd.read_json(file_url, lines=True, chunksize=1) as url_reader:
- for index, chuck in enumerate(url_reader):
- tm.assert_frame_equal(chuck, df_list_expected[index])
+ url_reader = pd.read_json(file_url, lines=True, chunksize=1)
+ for index, chuck in enumerate(url_reader):
+ tm.assert_frame_equal(chuck, df_list_expected[index])
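
A short sketch, under the same assumptions as the hunks above, of consuming a chunked read_json reader directly (without a with-block) and concatenating the chunks.

from io import StringIO

import pandas as pd

jsonl = '{"a": 1, "b": 2}\n{"a": 3, "b": 4}\n'
reader = pd.read_json(StringIO(jsonl), lines=True, chunksize=1)
chunked = pd.concat(reader)  # each iteration yields a one-row DataFrame
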
diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py
index ced0d540f33ef..086c0b7ba08b2 100644
--- a/pandas/tests/io/json/test_ujson.py
+++ b/pandas/tests/io/json/test_ujson.py
@@ -757,10 +757,10 @@ def test_array_reshaped(self, shape):
def test_array_list(self):
arr_list = [
"a",
- [],
- {},
- {},
- [],
+ list(),
+ dict(),
+ dict(),
+ list(),
42,
97.8,
["a", "b"],
@@ -797,9 +797,9 @@ def test_0d_array(self):
([42, {}, "a"], TypeError, {}),
([42, ["a"], 42], ValueError, {}),
(["a", "b", [], "c"], ValueError, {}),
- ([{"a": "b"}], ValueError, {"labelled": True}),
- ({"a": {"b": {"c": 42}}}, ValueError, {"labelled": True}),
- ([{"a": 42, "b": 23}, {"c": 17}], ValueError, {"labelled": True}),
+ ([{"a": "b"}], ValueError, dict(labelled=True)),
+ ({"a": {"b": {"c": 42}}}, ValueError, dict(labelled=True)),
+ ([{"a": 42, "b": 23}, {"c": 17}], ValueError, dict(labelled=True)),
],
)
def test_array_numpy_except(self, bad_input, exc_type, kwargs):
@@ -852,8 +852,8 @@ def test_dataframe(self, orient, numpy):
columns=["x", "y", "z"],
dtype=dtype,
)
- encode_kwargs = {} if orient is None else {"orient": orient}
- decode_kwargs = {} if numpy is None else {"numpy": numpy}
+ encode_kwargs = {} if orient is None else dict(orient=orient)
+ decode_kwargs = {} if numpy is None else dict(numpy=numpy)
assert (df.dtypes == dtype).all()
output = ujson.decode(ujson.encode(df, **encode_kwargs), **decode_kwargs)
@@ -884,7 +884,7 @@ def test_dataframe_nested(self, orient):
)
nested = {"df1": df, "df2": df.copy()}
- kwargs = {} if orient is None else {"orient": orient}
+ kwargs = {} if orient is None else dict(orient=orient)
exp = {
"df1": ujson.decode(ujson.encode(df, **kwargs)),
@@ -902,7 +902,7 @@ def test_dataframe_numpy_labelled(self, orient):
columns=["x", "y", "z"],
dtype=int,
)
- kwargs = {} if orient is None else {"orient": orient}
+ kwargs = {} if orient is None else dict(orient=orient)
output = DataFrame(
*ujson.decode(ujson.encode(df, **kwargs), numpy=True, labelled=True)
@@ -925,8 +925,8 @@ def test_series(self, orient, numpy):
).sort_values()
assert s.dtype == dtype
- encode_kwargs = {} if orient is None else {"orient": orient}
- decode_kwargs = {} if numpy is None else {"numpy": numpy}
+ encode_kwargs = {} if orient is None else dict(orient=orient)
+ decode_kwargs = {} if numpy is None else dict(numpy=numpy)
output = ujson.decode(ujson.encode(s, **encode_kwargs), **decode_kwargs)
assert s.dtype == dtype
@@ -953,7 +953,7 @@ def test_series_nested(self, orient):
[10, 20, 30, 40, 50, 60], name="series", index=[6, 7, 8, 9, 10, 15]
).sort_values()
nested = {"s1": s, "s2": s.copy()}
- kwargs = {} if orient is None else {"orient": orient}
+ kwargs = {} if orient is None else dict(orient=orient)
exp = {
"s1": ujson.decode(ujson.encode(s, **kwargs)),
diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py
index e8893b4c02238..d03c85f65ea8d 100644
--- a/pandas/tests/io/parser/conftest.py
+++ b/pandas/tests/io/parser/conftest.py
@@ -13,7 +13,7 @@ class BaseParser:
def update_kwargs(self, kwargs):
kwargs = kwargs.copy()
- kwargs.update({"engine": self.engine, "low_memory": self.low_memory})
+ kwargs.update(dict(engine=self.engine, low_memory=self.low_memory))
return kwargs
diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py
index 06ccfa7f62863..eee111dd4579c 100644
--- a/pandas/tests/io/parser/test_c_parser_only.py
+++ b/pandas/tests/io/parser/test_c_parser_only.py
@@ -376,10 +376,10 @@ def test_parse_trim_buffers(c_parser_only):
)
# Iterate over the CSV file in chunks of `chunksize` lines
- with parser.read_csv(
+ chunks_ = parser.read_csv(
StringIO(csv_data), header=None, dtype=object, chunksize=chunksize
- ) as chunks_:
- result = concat(chunks_, axis=0, ignore_index=True)
+ )
+ result = concat(chunks_, axis=0, ignore_index=True)
# Check for data corruption if there was no segfault
tm.assert_frame_equal(result, expected)
@@ -387,14 +387,14 @@ def test_parse_trim_buffers(c_parser_only):
# This extra test was added to replicate the fault in gh-5291.
# Force 'utf-8' encoding, so that `_string_convert` would take
# a different execution branch.
- with parser.read_csv(
+ chunks_ = parser.read_csv(
StringIO(csv_data),
header=None,
dtype=object,
chunksize=chunksize,
encoding="utf_8",
- ) as chunks_:
- result = concat(chunks_, axis=0, ignore_index=True)
+ )
+ result = concat(chunks_, axis=0, ignore_index=True)
tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py
index c8ed0d75b13a2..8f63d06859f62 100644
--- a/pandas/tests/io/parser/test_common.py
+++ b/pandas/tests/io/parser/test_common.py
@@ -195,11 +195,12 @@ def test_malformed_chunks(all_parsers, nrows):
"""
parser = all_parsers
msg = "Expected 3 fields in line 6, saw 5"
- with parser.read_csv(
+ reader = parser.read_csv(
StringIO(data), header=1, comment="#", iterator=True, chunksize=1, skiprows=[2]
- ) as reader:
- with pytest.raises(ParserError, match=msg):
- reader.read(nrows)
+ )
+
+ with pytest.raises(ParserError, match=msg):
+ reader.read(nrows)
def test_unnamed_columns(all_parsers):
@@ -470,6 +471,7 @@ def test_read_chunksize_with_index(all_parsers, index_col):
bar2,12,13,14,15
"""
+ reader = parser.read_csv(StringIO(data), index_col=0, chunksize=2)
expected = DataFrame(
[
["foo", 2, 3, 4, 5],
@@ -483,8 +485,7 @@ def test_read_chunksize_with_index(all_parsers, index_col):
)
expected = expected.set_index("index")
- with parser.read_csv(StringIO(data), index_col=0, chunksize=2) as reader:
- chunks = list(reader)
+ chunks = list(reader)
tm.assert_frame_equal(chunks[0], expected[:2])
tm.assert_frame_equal(chunks[1], expected[2:4])
tm.assert_frame_equal(chunks[2], expected[4:])
@@ -504,8 +505,7 @@ def test_read_chunksize_bad(all_parsers, chunksize):
msg = r"'chunksize' must be an integer >=1"
with pytest.raises(ValueError, match=msg):
- with parser.read_csv(StringIO(data), chunksize=chunksize) as _:
- pass
+ parser.read_csv(StringIO(data), chunksize=chunksize)
@pytest.mark.parametrize("chunksize", [2, 8])
@@ -522,9 +522,9 @@ def test_read_chunksize_and_nrows(all_parsers, chunksize):
parser = all_parsers
kwargs = dict(index_col=0, nrows=5)
+ reader = parser.read_csv(StringIO(data), chunksize=chunksize, **kwargs)
expected = parser.read_csv(StringIO(data), **kwargs)
- with parser.read_csv(StringIO(data), chunksize=chunksize, **kwargs) as reader:
- tm.assert_frame_equal(concat(reader), expected)
+ tm.assert_frame_equal(concat(reader), expected)
def test_read_chunksize_and_nrows_changing_size(all_parsers):
@@ -539,13 +539,14 @@ def test_read_chunksize_and_nrows_changing_size(all_parsers):
parser = all_parsers
kwargs = dict(index_col=0, nrows=5)
+ reader = parser.read_csv(StringIO(data), chunksize=8, **kwargs)
expected = parser.read_csv(StringIO(data), **kwargs)
- with parser.read_csv(StringIO(data), chunksize=8, **kwargs) as reader:
- tm.assert_frame_equal(reader.get_chunk(size=2), expected.iloc[:2])
- tm.assert_frame_equal(reader.get_chunk(size=4), expected.iloc[2:5])
- with pytest.raises(StopIteration, match=""):
- reader.get_chunk(size=3)
+ tm.assert_frame_equal(reader.get_chunk(size=2), expected.iloc[:2])
+ tm.assert_frame_equal(reader.get_chunk(size=4), expected.iloc[2:5])
+
+ with pytest.raises(StopIteration, match=""):
+ reader.get_chunk(size=3)
def test_get_chunk_passed_chunksize(all_parsers):
@@ -556,8 +557,8 @@ def test_get_chunk_passed_chunksize(all_parsers):
7,8,9
1,2,3"""
- with parser.read_csv(StringIO(data), chunksize=2) as reader:
- result = reader.get_chunk()
+ reader = parser.read_csv(StringIO(data), chunksize=2)
+ result = reader.get_chunk()
expected = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"])
tm.assert_frame_equal(result, expected)
@@ -575,9 +576,10 @@ def test_read_chunksize_compat(all_parsers, kwargs):
bar2,12,13,14,15
"""
parser = all_parsers
+ reader = parser.read_csv(StringIO(data), chunksize=2, **kwargs)
+
result = parser.read_csv(StringIO(data), **kwargs)
- with parser.read_csv(StringIO(data), chunksize=2, **kwargs) as reader:
- tm.assert_frame_equal(concat(reader), result)
+ tm.assert_frame_equal(concat(reader), result)
def test_read_chunksize_jagged_names(all_parsers):
@@ -586,8 +588,9 @@ def test_read_chunksize_jagged_names(all_parsers):
data = "\n".join(["0"] * 7 + [",".join(["0"] * 10)])
expected = DataFrame([[0] + [np.nan] * 9] * 7 + [[0] * 10])
- with parser.read_csv(StringIO(data), names=range(10), chunksize=4) as reader:
- result = concat(reader)
+ reader = parser.read_csv(StringIO(data), names=range(10), chunksize=4)
+
+ result = concat(reader)
tm.assert_frame_equal(result, expected)
@@ -599,8 +602,8 @@ def test_read_data_list(all_parsers):
data_list = [["A", "B", "C"], ["foo", "1", "2", "3"], ["bar", "4", "5", "6"]]
expected = parser.read_csv(StringIO(data), **kwargs)
- with TextParser(data_list, chunksize=2, **kwargs) as parser:
- result = parser.read()
+ parser = TextParser(data_list, chunksize=2, **kwargs)
+ result = parser.read()
tm.assert_frame_equal(result, expected)
@@ -619,12 +622,12 @@ def test_iterator(all_parsers):
kwargs = dict(index_col=0)
expected = parser.read_csv(StringIO(data), **kwargs)
- with parser.read_csv(StringIO(data), iterator=True, **kwargs) as reader:
+ reader = parser.read_csv(StringIO(data), iterator=True, **kwargs)
- first_chunk = reader.read(3)
- tm.assert_frame_equal(first_chunk, expected[:3])
+ first_chunk = reader.read(3)
+ tm.assert_frame_equal(first_chunk, expected[:3])
- last_chunk = reader.read(5)
+ last_chunk = reader.read(5)
tm.assert_frame_equal(last_chunk, expected[3:])
@@ -636,8 +639,8 @@ def test_iterator2(all_parsers):
baz,7,8,9
"""
- with parser.read_csv(StringIO(data), iterator=True) as reader:
- result = list(reader)
+ reader = parser.read_csv(StringIO(data), iterator=True)
+ result = list(reader)
expected = DataFrame(
[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
@@ -660,10 +663,10 @@ def test_reader_list(all_parsers):
kwargs = dict(index_col=0)
lines = list(csv.reader(StringIO(data)))
- with TextParser(lines, chunksize=2, **kwargs) as reader:
- chunks = list(reader)
+ reader = TextParser(lines, chunksize=2, **kwargs)
expected = parser.read_csv(StringIO(data), **kwargs)
+ chunks = list(reader)
tm.assert_frame_equal(chunks[0], expected[:2])
tm.assert_frame_equal(chunks[1], expected[2:4])
@@ -683,10 +686,10 @@ def test_reader_list_skiprows(all_parsers):
kwargs = dict(index_col=0)
lines = list(csv.reader(StringIO(data)))
- with TextParser(lines, chunksize=2, skiprows=[1], **kwargs) as reader:
- chunks = list(reader)
+ reader = TextParser(lines, chunksize=2, skiprows=[1], **kwargs)
expected = parser.read_csv(StringIO(data), **kwargs)
+ chunks = list(reader)
tm.assert_frame_equal(chunks[0], expected[1:3])
@@ -700,8 +703,8 @@ def test_iterator_stop_on_chunksize(all_parsers):
baz,7,8,9
"""
- with parser.read_csv(StringIO(data), chunksize=1) as reader:
- result = list(reader)
+ reader = parser.read_csv(StringIO(data), chunksize=1)
+ result = list(reader)
assert len(result) == 3
expected = DataFrame(
@@ -721,8 +724,7 @@ def test_iterator_skipfooter_errors(all_parsers, kwargs):
data = "a\n1\n2"
with pytest.raises(ValueError, match=msg):
- with parser.read_csv(StringIO(data), skipfooter=1, **kwargs) as _:
- pass
+ parser.read_csv(StringIO(data), skipfooter=1, **kwargs)
def test_nrows_skipfooter_errors(all_parsers):
@@ -1360,8 +1362,7 @@ def test_empty_with_nrows_chunksize(all_parsers, iterator):
data = StringIO("foo,bar\n")
if iterator:
- with parser.read_csv(data, chunksize=nrows) as reader:
- result = next(iter(reader))
+ result = next(iter(parser.read_csv(data, chunksize=nrows)))
else:
result = parser.read_csv(data, nrows=nrows)
@@ -2055,9 +2056,10 @@ def test_read_csv_memory_growth_chunksize(all_parsers):
for i in range(1000):
f.write(str(i) + "\n")
- with parser.read_csv(path, chunksize=20) as result:
- for _ in result:
- pass
+ result = parser.read_csv(path, chunksize=20)
+
+ for _ in result:
+ pass
def test_read_csv_raises_on_header_prefix(all_parsers):
@@ -2308,35 +2310,3 @@ def test_memory_map_compression(all_parsers, compression):
parser.read_csv(path, memory_map=True, compression=compression),
expected,
)
-
-
-def test_context_manager(all_parsers, datapath):
- # make sure that opened files are closed
- parser = all_parsers
-
- path = datapath("io", "data", "csv", "iris.csv")
-
- reader = parser.read_csv(path, chunksize=1)
- assert not reader._engine.handles.handle.closed
- try:
- with reader:
- next(reader)
- assert False
- except AssertionError:
- assert reader._engine.handles.handle.closed
-
-
-def test_context_manageri_user_provided(all_parsers, datapath):
- # make sure that user-provided handles are not closed
- parser = all_parsers
-
- with open(datapath("io", "data", "csv", "iris.csv"), mode="r") as path:
-
- reader = parser.read_csv(path, chunksize=1)
- assert not reader._engine.handles.handle.closed
- try:
- with reader:
- next(reader)
- assert False
- except AssertionError:
- assert not reader._engine.handles.handle.closed
diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py
index 220d9474c6dbf..690d3133dae5e 100644
--- a/pandas/tests/io/parser/test_compression.py
+++ b/pandas/tests/io/parser/test_compression.py
@@ -109,7 +109,7 @@ def test_compression(parser_and_data, compression_only, buffer, filename):
def test_infer_compression(all_parsers, csv1, buffer, ext):
# see gh-9770
parser = all_parsers
- kwargs = {"index_col": 0, "parse_dates": True}
+ kwargs = dict(index_col=0, parse_dates=True)
expected = parser.read_csv(csv1, **kwargs)
kwargs["compression"] = "infer"
@@ -144,7 +144,7 @@ def test_compression_utf_encoding(all_parsers, csv_dir_path, utf_value, encoding
@pytest.mark.parametrize("invalid_compression", ["sfark", "bz3", "zipper"])
def test_invalid_compression(all_parsers, invalid_compression):
parser = all_parsers
- compress_kwargs = {"compression": invalid_compression}
+ compress_kwargs = dict(compression=invalid_compression)
msg = f"Unrecognized compression type: {invalid_compression}"
diff --git a/pandas/tests/io/parser/test_converters.py b/pandas/tests/io/parser/test_converters.py
index 1d2fb7fddc9dd..88b400d9a11df 100644
--- a/pandas/tests/io/parser/test_converters.py
+++ b/pandas/tests/io/parser/test_converters.py
@@ -57,7 +57,7 @@ def test_converters_no_implicit_conv(all_parsers):
def test_converters_euro_decimal_format(all_parsers):
# see gh-583
- converters = {}
+ converters = dict()
parser = all_parsers
data = """Id;Number1;Number2;Text1;Text2;Number3
diff --git a/pandas/tests/io/parser/test_dtypes.py b/pandas/tests/io/parser/test_dtypes.py
index 1e68e54b413b0..861aeba60cab7 100644
--- a/pandas/tests/io/parser/test_dtypes.py
+++ b/pandas/tests/io/parser/test_dtypes.py
@@ -213,11 +213,10 @@ def test_categorical_dtype_chunksize_infer_categories(all_parsers):
DataFrame({"a": [1, 1], "b": Categorical(["a", "b"])}),
DataFrame({"a": [1, 2], "b": Categorical(["b", "c"])}, index=[2, 3]),
]
- with parser.read_csv(
- StringIO(data), dtype={"b": "category"}, chunksize=2
- ) as actuals:
- for actual, expected in zip(actuals, expecteds):
- tm.assert_frame_equal(actual, expected)
+ actuals = parser.read_csv(StringIO(data), dtype={"b": "category"}, chunksize=2)
+
+ for actual, expected in zip(actuals, expecteds):
+ tm.assert_frame_equal(actual, expected)
def test_categorical_dtype_chunksize_explicit_categories(all_parsers):
@@ -236,9 +235,10 @@ def test_categorical_dtype_chunksize_explicit_categories(all_parsers):
),
]
dtype = CategoricalDtype(cats)
- with parser.read_csv(StringIO(data), dtype={"b": dtype}, chunksize=2) as actuals:
- for actual, expected in zip(actuals, expecteds):
- tm.assert_frame_equal(actual, expected)
+ actuals = parser.read_csv(StringIO(data), dtype={"b": dtype}, chunksize=2)
+
+ for actual, expected in zip(actuals, expecteds):
+ tm.assert_frame_equal(actual, expected)
@pytest.mark.parametrize("ordered", [False, True])
@@ -495,7 +495,7 @@ def test_dtype_with_converters(all_parsers):
(np.float64, DataFrame(columns=["a", "b"], dtype=np.float64)),
("category", DataFrame({"a": Categorical([]), "b": Categorical([])}, index=[])),
(
- {"a": "category", "b": "category"},
+ dict(a="category", b="category"),
DataFrame({"a": Categorical([]), "b": Categorical([])}, index=[]),
),
("datetime64[ns]", DataFrame(columns=["a", "b"], dtype="datetime64[ns]")),
@@ -510,7 +510,7 @@ def test_dtype_with_converters(all_parsers):
),
),
(
- {"a": np.int64, "b": np.int32},
+ dict(a=np.int64, b=np.int32),
DataFrame(
{"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)},
index=[],
diff --git a/pandas/tests/io/parser/test_mangle_dupes.py b/pandas/tests/io/parser/test_mangle_dupes.py
index 457a6567febab..5c4e642115798 100644
--- a/pandas/tests/io/parser/test_mangle_dupes.py
+++ b/pandas/tests/io/parser/test_mangle_dupes.py
@@ -11,7 +11,7 @@
import pandas._testing as tm
-@pytest.mark.parametrize("kwargs", [{}, {"mangle_dupe_cols": True}])
+@pytest.mark.parametrize("kwargs", [dict(), dict(mangle_dupe_cols=True)])
def test_basic(all_parsers, kwargs):
# TODO: add test for condition "mangle_dupe_cols=False"
# once it is actually supported (gh-12935)
diff --git a/pandas/tests/io/parser/test_multi_thread.py b/pandas/tests/io/parser/test_multi_thread.py
index 123dce2048a44..d50560c684084 100644
--- a/pandas/tests/io/parser/test_multi_thread.py
+++ b/pandas/tests/io/parser/test_multi_thread.py
@@ -2,7 +2,6 @@
Tests multithreading behaviour for reading and
parsing files for each parser defined in parsers.py
"""
-from contextlib import ExitStack
from io import BytesIO
from multiprocessing.pool import ThreadPool
@@ -47,18 +46,16 @@ def test_multi_thread_string_io_read_csv(all_parsers):
"\n".join([f"{i:d},{i:d},{i:d}" for i in range(max_row_range)]).encode()
for _ in range(num_files)
]
+ files = [BytesIO(b) for b in bytes_to_df]
# Read all files in many threads.
- with ExitStack() as stack:
- files = [stack.enter_context(BytesIO(b)) for b in bytes_to_df]
+ pool = ThreadPool(8)
- pool = stack.enter_context(ThreadPool(8))
+ results = pool.map(parser.read_csv, files)
+ first_result = results[0]
- results = pool.map(parser.read_csv, files)
- first_result = results[0]
-
- for result in results:
- tm.assert_frame_equal(first_result, result)
+ for result in results:
+ tm.assert_frame_equal(first_result, result)
def _generate_multi_thread_dataframe(parser, path, num_rows, num_tasks):
@@ -119,8 +116,8 @@ def reader(arg):
(num_rows * i // num_tasks, num_rows // num_tasks) for i in range(num_tasks)
]
- with ThreadPool(processes=num_tasks) as pool:
- results = pool.map(reader, tasks)
+ pool = ThreadPool(processes=num_tasks)
+ results = pool.map(reader, tasks)
header = results[0].columns
diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py
index 97f82b9a01a9a..b8b03cbd14a1d 100644
--- a/pandas/tests/io/parser/test_network.py
+++ b/pandas/tests/io/parser/test_network.py
@@ -122,45 +122,41 @@ def test_parse_public_s3_bucket_chunked(self, tips_df, s3so):
# Read with a chunksize
chunksize = 5
for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
- with read_csv(
+ df_reader = read_csv(
"s3://pandas-test/tips.csv" + ext,
chunksize=chunksize,
compression=comp,
storage_options=s3so,
- ) as df_reader:
- assert df_reader.chunksize == chunksize
- for i_chunk in [0, 1, 2]:
- # Read a couple of chunks and make sure we see them
- # properly.
- df = df_reader.get_chunk()
- assert isinstance(df, DataFrame)
- assert not df.empty
- true_df = tips_df.iloc[
- chunksize * i_chunk : chunksize * (i_chunk + 1)
- ]
- tm.assert_frame_equal(true_df, df)
+ )
+ assert df_reader.chunksize == chunksize
+ for i_chunk in [0, 1, 2]:
+ # Read a couple of chunks and make sure we see them
+ # properly.
+ df = df_reader.get_chunk()
+ assert isinstance(df, DataFrame)
+ assert not df.empty
+ true_df = tips_df.iloc[chunksize * i_chunk : chunksize * (i_chunk + 1)]
+ tm.assert_frame_equal(true_df, df)
def test_parse_public_s3_bucket_chunked_python(self, tips_df, s3so):
# Read with a chunksize using the Python parser
chunksize = 5
for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
- with read_csv(
+ df_reader = read_csv(
"s3://pandas-test/tips.csv" + ext,
chunksize=chunksize,
compression=comp,
engine="python",
storage_options=s3so,
- ) as df_reader:
- assert df_reader.chunksize == chunksize
- for i_chunk in [0, 1, 2]:
- # Read a couple of chunks and make sure we see them properly.
- df = df_reader.get_chunk()
- assert isinstance(df, DataFrame)
- assert not df.empty
- true_df = tips_df.iloc[
- chunksize * i_chunk : chunksize * (i_chunk + 1)
- ]
- tm.assert_frame_equal(true_df, df)
+ )
+ assert df_reader.chunksize == chunksize
+ for i_chunk in [0, 1, 2]:
+ # Read a couple of chunks and make sure we see them properly.
+ df = df_reader.get_chunk()
+ assert isinstance(df, DataFrame)
+ assert not df.empty
+ true_df = tips_df.iloc[chunksize * i_chunk : chunksize * (i_chunk + 1)]
+ tm.assert_frame_equal(true_df, df)
def test_parse_public_s3_bucket_python(self, tips_df, s3so):
for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py
index a20ca508ebbfe..7a5203ca86520 100644
--- a/pandas/tests/io/parser/test_parse_dates.py
+++ b/pandas/tests/io/parser/test_parse_dates.py
@@ -1020,13 +1020,13 @@ def test_multiple_date_cols_chunked(all_parsers):
)
expected = expected.set_index("nominal")
- with parser.read_csv(
+ reader = parser.read_csv(
StringIO(data),
parse_dates={"nominal": [1, 2]},
index_col="nominal",
chunksize=2,
- ) as reader:
- chunks = list(reader)
+ )
+ chunks = list(reader)
tm.assert_frame_equal(chunks[0], expected[:2])
tm.assert_frame_equal(chunks[1], expected[2:4])
diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py
index 1af69785c7584..413b78a52ad38 100644
--- a/pandas/tests/io/parser/test_textreader.py
+++ b/pandas/tests/io/parser/test_textreader.py
@@ -336,10 +336,8 @@ def test_empty_field_eof(self):
def test_empty_csv_input(self):
# GH14867
- with read_csv(
- StringIO(), chunksize=20, header=None, names=["a", "b", "c"]
- ) as df:
- assert isinstance(df, TextFileReader)
+ df = read_csv(StringIO(), chunksize=20, header=None, names=["a", "b", "c"])
+ assert isinstance(df, TextFileReader)
def assert_array_dicts_equal(left, right):
diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py
index f8d283f622d4d..afd2f56efb935 100644
--- a/pandas/tests/io/pytables/test_store.py
+++ b/pandas/tests/io/pytables/test_store.py
@@ -112,7 +112,7 @@ def roundtrip(key, obj, **kwargs):
tm.assert_frame_equal(o, roundtrip("frame", o))
# table
- df = DataFrame({"A": range(5), "B": range(5)})
+ df = DataFrame(dict(A=range(5), B=range(5)))
df.to_hdf(path, "table", append=True)
result = read_hdf(path, "table", where=["index>2"])
tm.assert_frame_equal(df[df.index > 2], result)
@@ -370,7 +370,7 @@ def test_keys_ignore_hdf_softlink(self, setup_path):
with ensure_clean_store(setup_path) as store:
- df = DataFrame({"A": range(5), "B": range(5)})
+ df = DataFrame(dict(A=range(5), B=range(5)))
store.put("df", df)
assert store.keys() == ["/df"]
@@ -1081,7 +1081,7 @@ def check(format, index):
def test_encoding(self, setup_path):
with ensure_clean_store(setup_path) as store:
- df = DataFrame({"A": "foo", "B": "bar"}, index=range(5))
+ df = DataFrame(dict(A="foo", B="bar"), index=range(5))
df.loc[2, "A"] = np.nan
df.loc[3, "B"] = np.nan
_maybe_remove(store, "df")
@@ -1458,7 +1458,7 @@ def check_col(key, name, size):
store.get_storer(key).table.description, name
).itemsize, size
- df = DataFrame({"A": "foo", "B": "bar"}, index=range(10))
+ df = DataFrame(dict(A="foo", B="bar"), index=range(10))
# a min_itemsize that creates a data_column
_maybe_remove(store, "df")
@@ -1631,13 +1631,16 @@ def check_col(key, name, size):
& (df_new.A > 0)
& (df_new.B < 0)
]
- tm.assert_frame_equal(result, expected, check_freq=False)
- # FIXME: 2020-05-07 freq check randomly fails in the CI
+ tm.assert_frame_equal(
+ result, expected, check_index_type=False, check_freq=False
+ )
# yield an empty frame
result = store.select("df", "string='foo' and string2='cool'")
expected = df_new[(df_new.string == "foo") & (df_new.string2 == "cool")]
- tm.assert_frame_equal(result, expected)
+ tm.assert_frame_equal(
+ result, expected, check_index_type=False, check_freq=False
+ )
with ensure_clean_store(setup_path) as store:
# doc example
@@ -1657,11 +1660,16 @@ def check_col(key, name, size):
result = store.select("df_dc", "B>0")
expected = df_dc[df_dc.B > 0]
- tm.assert_frame_equal(result, expected)
+ tm.assert_frame_equal(
+ result, expected, check_index_type=False, check_freq=False
+ )
result = store.select("df_dc", ["B > 0", "C > 0", "string == foo"])
expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == "foo")]
- tm.assert_frame_equal(result, expected)
+ tm.assert_frame_equal(
+ result, expected, check_index_type=False, check_freq=False
+ )
+ # FIXME: 2020-05-07 freq check randomly fails in the CI
with ensure_clean_store(setup_path) as store:
# doc example part 2
@@ -2188,13 +2196,13 @@ def test_append_with_timedelta(self, setup_path):
# append timedelta
df = DataFrame(
- {
- "A": Timestamp("20130101"),
- "B": [
+ dict(
+ A=Timestamp("20130101"),
+ B=[
Timestamp("20130101") + timedelta(days=i, seconds=10)
for i in range(10)
],
- }
+ )
)
df["C"] = df["A"] - df["B"]
df.loc[3:5, "C"] = np.nan
@@ -2366,7 +2374,9 @@ def test_series(self, setup_path):
ts3 = Series(
ts.values, Index(np.asarray(ts.index, dtype=object), dtype=object)
)
- self._check_roundtrip(ts3, tm.assert_series_equal, path=setup_path)
+ self._check_roundtrip(
+ ts3, tm.assert_series_equal, path=setup_path, check_index_type=False
+ )
def test_float_index(self, setup_path):
@@ -2732,10 +2742,7 @@ def test_select_dtypes(self, setup_path):
with ensure_clean_store(setup_path) as store:
# with a Timestamp data column (GH #2637)
df = DataFrame(
- {
- "ts": bdate_range("2012-01-01", periods=300),
- "A": np.random.randn(300),
- }
+ dict(ts=bdate_range("2012-01-01", periods=300), A=np.random.randn(300))
)
_maybe_remove(store, "df")
store.append("df", df, data_columns=["ts", "A"])
@@ -2763,7 +2770,7 @@ def test_select_dtypes(self, setup_path):
tm.assert_frame_equal(expected, result)
# integer index
- df = DataFrame({"A": np.random.rand(20), "B": np.random.rand(20)})
+ df = DataFrame(dict(A=np.random.rand(20), B=np.random.rand(20)))
_maybe_remove(store, "df_int")
store.append("df_int", df)
result = store.select("df_int", "index<10 and columns=['A']")
@@ -2772,11 +2779,11 @@ def test_select_dtypes(self, setup_path):
# float index
df = DataFrame(
- {
- "A": np.random.rand(20),
- "B": np.random.rand(20),
- "index": np.arange(20, dtype="f8"),
- }
+ dict(
+ A=np.random.rand(20),
+ B=np.random.rand(20),
+ index=np.arange(20, dtype="f8"),
+ )
)
_maybe_remove(store, "df_float")
store.append("df_float", df)
@@ -2787,7 +2794,7 @@ def test_select_dtypes(self, setup_path):
with ensure_clean_store(setup_path) as store:
# floats w/o NaN
- df = DataFrame({"cols": range(11), "values": range(11)}, dtype="float64")
+ df = DataFrame(dict(cols=range(11), values=range(11)), dtype="float64")
df["cols"] = (df["cols"] + 10).apply(str)
store.append("df1", df, data_columns=True)
@@ -2811,7 +2818,7 @@ def test_select_dtypes(self, setup_path):
# tm.assert_frame_equal(expected, result)
# not in first position float with NaN ok too
- df = DataFrame({"cols": range(11), "values": range(11)}, dtype="float64")
+ df = DataFrame(dict(cols=range(11), values=range(11)), dtype="float64")
df["cols"] = (df["cols"] + 10).apply(str)
df.iloc[1] = np.nan
@@ -2838,15 +2845,15 @@ def test_select_with_many_inputs(self, setup_path):
with ensure_clean_store(setup_path) as store:
df = DataFrame(
- {
- "ts": bdate_range("2012-01-01", periods=300),
- "A": np.random.randn(300),
- "B": range(300),
- "users": ["a"] * 50
+ dict(
+ ts=bdate_range("2012-01-01", periods=300),
+ A=np.random.randn(300),
+ B=range(300),
+ users=["a"] * 50
+ ["b"] * 50
+ ["c"] * 100
+ [f"a{i:03d}" for i in range(100)],
- }
+ )
)
_maybe_remove(store, "df")
store.append("df", df, data_columns=["ts", "A", "B", "users"])
@@ -3142,7 +3149,7 @@ def test_retain_index_attributes(self, setup_path):
# GH 3499, losing frequency info on index recreation
df = DataFrame(
- {"A": Series(range(3), index=date_range("2000-1-1", periods=3, freq="H"))}
+ dict(A=Series(range(3), index=date_range("2000-1-1", periods=3, freq="H")))
)
with ensure_clean_store(setup_path) as store:
@@ -3161,11 +3168,11 @@ def test_retain_index_attributes(self, setup_path):
# try to append a table with a different frequency
with catch_warnings(record=True):
df2 = DataFrame(
- {
- "A": Series(
+ dict(
+ A=Series(
range(3), index=date_range("2002-1-1", periods=3, freq="D")
)
- }
+ )
)
store.append("data", df2)
@@ -3174,8 +3181,8 @@ def test_retain_index_attributes(self, setup_path):
# this is ok
_maybe_remove(store, "df2")
df2 = DataFrame(
- {
- "A": Series(
+ dict(
+ A=Series(
range(3),
index=[
Timestamp("20010101"),
@@ -3183,15 +3190,15 @@ def test_retain_index_attributes(self, setup_path):
Timestamp("20020101"),
],
)
- }
+ )
)
store.append("df2", df2)
df3 = DataFrame(
- {
- "A": Series(
+ dict(
+ A=Series(
range(3), index=date_range("2002-1-1", periods=3, freq="D")
)
- }
+ )
)
store.append("df2", df3)
@@ -3204,26 +3211,25 @@ def test_retain_index_attributes2(self, setup_path):
with catch_warnings(record=True):
df = DataFrame(
- {
- "A": Series(
+ dict(
+ A=Series(
range(3), index=date_range("2000-1-1", periods=3, freq="H")
)
- }
+ )
)
df.to_hdf(path, "data", mode="w", append=True)
df2 = DataFrame(
- {
- "A": Series(
+ dict(
+ A=Series(
range(3), index=date_range("2002-1-1", periods=3, freq="D")
)
- }
+ )
)
-
df2.to_hdf(path, "data", append=True)
idx = date_range("2000-1-1", periods=3, freq="H")
idx.name = "foo"
- df = DataFrame({"A": Series(range(3), index=idx)})
+ df = DataFrame(dict(A=Series(range(3), index=idx)))
df.to_hdf(path, "data", mode="w", append=True)
assert read_hdf(path, "data").index.name == "foo"
@@ -3232,7 +3238,7 @@ def test_retain_index_attributes2(self, setup_path):
idx2 = date_range("2001-1-1", periods=3, freq="H")
idx2.name = "bar"
- df2 = DataFrame({"A": Series(range(3), index=idx2)})
+ df2 = DataFrame(dict(A=Series(range(3), index=idx2)))
df2.to_hdf(path, "data", append=True)
assert read_hdf(path, "data").index.name is None
@@ -3533,7 +3539,7 @@ def test_coordinates(self, setup_path):
# get coordinates back & test vs frame
_maybe_remove(store, "df")
- df = DataFrame({"A": range(5), "B": range(5)})
+ df = DataFrame(dict(A=range(5), B=range(5)))
store.append("df", df)
c = store.select_as_coordinates("df", ["index<3"])
assert (c.values == np.arange(3)).all()
@@ -3795,12 +3801,12 @@ def test_nan_selection_bug_4858(self, setup_path):
with ensure_clean_store(setup_path) as store:
- df = DataFrame({"cols": range(6), "values": range(6)}, dtype="float64")
+ df = DataFrame(dict(cols=range(6), values=range(6)), dtype="float64")
df["cols"] = (df["cols"] + 10).apply(str)
df.iloc[0] = np.nan
expected = DataFrame(
- {"cols": ["13.0", "14.0", "15.0"], "values": [3.0, 4.0, 5.0]},
+ dict(cols=["13.0", "14.0", "15.0"], values=[3.0, 4.0, 5.0]),
index=[3, 4, 5],
)
@@ -3814,7 +3820,7 @@ def test_start_stop_table(self, setup_path):
with ensure_clean_store(setup_path) as store:
# table
- df = DataFrame({"A": np.random.rand(20), "B": np.random.rand(20)})
+ df = DataFrame(dict(A=np.random.rand(20), B=np.random.rand(20)))
store.append("df", df)
result = store.select("df", "columns=['A']", start=0, stop=5)
@@ -3849,7 +3855,7 @@ def test_start_stop_fixed(self, setup_path):
# fixed, GH 8287
df = DataFrame(
- {"A": np.random.rand(20), "B": np.random.rand(20)},
+ dict(A=np.random.rand(20), B=np.random.rand(20)),
index=pd.date_range("20130101", periods=20),
)
store.put("df", df)
@@ -4482,7 +4488,7 @@ def test_categorical_conversion(self, setup_path):
data = [4.3, 9.8]
# Test without categories
- df = DataFrame({"obsids": obsids, "imgids": imgids, "data": data})
+ df = DataFrame(dict(obsids=obsids, imgids=imgids, data=data))
# We are expecting an empty DataFrame matching types of df
expected = df.iloc[[], :]
diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py
index cca62c5af59a1..9de6ca75fd4d9 100644
--- a/pandas/tests/io/sas/test_sas7bdat.py
+++ b/pandas/tests/io/sas/test_sas7bdat.py
@@ -52,22 +52,24 @@ def test_from_buffer(self):
with open(fname, "rb") as f:
byts = f.read()
buf = io.BytesIO(byts)
- with pd.read_sas(
+ rdr = pd.read_sas(
buf, format="sas7bdat", iterator=True, encoding="utf-8"
- ) as rdr:
- df = rdr.read()
+ )
+ df = rdr.read()
tm.assert_frame_equal(df, df0, check_exact=False)
+ rdr.close()
def test_from_iterator(self):
for j in 0, 1:
df0 = self.data[j]
for k in self.test_ix[j]:
fname = os.path.join(self.dirpath, f"test{k}.sas7bdat")
- with pd.read_sas(fname, iterator=True, encoding="utf-8") as rdr:
- df = rdr.read(2)
- tm.assert_frame_equal(df, df0.iloc[0:2, :])
- df = rdr.read(3)
- tm.assert_frame_equal(df, df0.iloc[2:5, :])
+ rdr = pd.read_sas(fname, iterator=True, encoding="utf-8")
+ df = rdr.read(2)
+ tm.assert_frame_equal(df, df0.iloc[0:2, :])
+ df = rdr.read(3)
+ tm.assert_frame_equal(df, df0.iloc[2:5, :])
+ rdr.close()
def test_path_pathlib(self):
for j in 0, 1:
@@ -94,24 +96,25 @@ def test_iterator_loop(self):
for k in self.test_ix[j]:
for chunksize in 3, 5, 10, 11:
fname = os.path.join(self.dirpath, f"test{k}.sas7bdat")
- with pd.read_sas(fname, chunksize=10, encoding="utf-8") as rdr:
- y = 0
- for x in rdr:
- y += x.shape[0]
+ rdr = pd.read_sas(fname, chunksize=10, encoding="utf-8")
+ y = 0
+ for x in rdr:
+ y += x.shape[0]
assert y == rdr.row_count
+ rdr.close()
def test_iterator_read_too_much(self):
# github #14734
k = self.test_ix[0][0]
fname = os.path.join(self.dirpath, f"test{k}.sas7bdat")
- with pd.read_sas(
- fname, format="sas7bdat", iterator=True, encoding="utf-8"
- ) as rdr:
- d1 = rdr.read(rdr.row_count + 20)
+ rdr = pd.read_sas(fname, format="sas7bdat", iterator=True, encoding="utf-8")
+ d1 = rdr.read(rdr.row_count + 20)
+ rdr.close()
- with pd.read_sas(fname, iterator=True, encoding="utf-8") as rdr:
- d2 = rdr.read(rdr.row_count + 20)
+ rdr = pd.read_sas(fname, iterator=True, encoding="utf-8")
+ d2 = rdr.read(rdr.row_count + 20)
tm.assert_frame_equal(d1, d2)
+ rdr.close()
def test_encoding_options(datapath):
diff --git a/pandas/tests/io/sas/test_xport.py b/pandas/tests/io/sas/test_xport.py
index a8713f5bf36c9..939edb3d8e0b4 100644
--- a/pandas/tests/io/sas/test_xport.py
+++ b/pandas/tests/io/sas/test_xport.py
@@ -47,25 +47,29 @@ def test1_basic(self):
num_rows = data.shape[0]
# Test reading beyond end of file
- with read_sas(self.file01, format="xport", iterator=True) as reader:
- data = reader.read(num_rows + 100)
+ reader = read_sas(self.file01, format="xport", iterator=True)
+ data = reader.read(num_rows + 100)
assert data.shape[0] == num_rows
+ reader.close()
# Test incremental read with `read` method.
- with read_sas(self.file01, format="xport", iterator=True) as reader:
- data = reader.read(10)
+ reader = read_sas(self.file01, format="xport", iterator=True)
+ data = reader.read(10)
+ reader.close()
tm.assert_frame_equal(data, data_csv.iloc[0:10, :])
# Test incremental read with `get_chunk` method.
- with read_sas(self.file01, format="xport", chunksize=10) as reader:
- data = reader.get_chunk()
+ reader = read_sas(self.file01, format="xport", chunksize=10)
+ data = reader.get_chunk()
+ reader.close()
tm.assert_frame_equal(data, data_csv.iloc[0:10, :])
# Test read in loop
m = 0
- with read_sas(self.file01, format="xport", chunksize=100) as reader:
- for x in reader:
- m += x.shape[0]
+ reader = read_sas(self.file01, format="xport", chunksize=100)
+ for x in reader:
+ m += x.shape[0]
+ reader.close()
assert m == num_rows
# Read full file with `read_sas` method
@@ -85,17 +89,15 @@ def test1_index(self):
tm.assert_frame_equal(data, data_csv, check_index_type=False)
# Test incremental read with `read` method.
- with read_sas(
- self.file01, index="SEQN", format="xport", iterator=True
- ) as reader:
- data = reader.read(10)
+ reader = read_sas(self.file01, index="SEQN", format="xport", iterator=True)
+ data = reader.read(10)
+ reader.close()
tm.assert_frame_equal(data, data_csv.iloc[0:10, :], check_index_type=False)
# Test incremental read with `get_chunk` method.
- with read_sas(
- self.file01, index="SEQN", format="xport", chunksize=10
- ) as reader:
- data = reader.get_chunk()
+ reader = read_sas(self.file01, index="SEQN", format="xport", chunksize=10)
+ data = reader.get_chunk()
+ reader.close()
tm.assert_frame_equal(data, data_csv.iloc[0:10, :], check_index_type=False)
def test1_incremental(self):
@@ -105,8 +107,9 @@ def test1_incremental(self):
data_csv = data_csv.set_index("SEQN")
numeric_as_float(data_csv)
- with read_sas(self.file01, index="SEQN", chunksize=1000) as reader:
- all_data = list(reader)
+ reader = read_sas(self.file01, index="SEQN", chunksize=1000)
+
+ all_data = list(reader)
data = pd.concat(all_data, axis=0)
tm.assert_frame_equal(data, data_csv, check_index_type=False)
diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py
index c3b21daa0ac04..c7a7101b5fe17 100644
--- a/pandas/tests/io/test_common.py
+++ b/pandas/tests/io/test_common.py
@@ -121,16 +121,16 @@ def test_get_handle_with_buffer(self):
input_buffer.close()
def test_iterator(self):
- with pd.read_csv(StringIO(self.data1), chunksize=1) as reader:
- result = pd.concat(reader, ignore_index=True)
+ reader = pd.read_csv(StringIO(self.data1), chunksize=1)
+ result = pd.concat(reader, ignore_index=True)
expected = pd.read_csv(StringIO(self.data1))
tm.assert_frame_equal(result, expected)
# GH12153
- with pd.read_csv(StringIO(self.data1), chunksize=1) as it:
- first = next(it)
- tm.assert_frame_equal(first, expected.iloc[[0]])
- tm.assert_frame_equal(pd.concat(it), expected.iloc[1:])
+ it = pd.read_csv(StringIO(self.data1), chunksize=1)
+ first = next(it)
+ tm.assert_frame_equal(first, expected.iloc[[0]])
+ tm.assert_frame_equal(pd.concat(it), expected.iloc[1:])
@pytest.mark.parametrize(
"reader, module, error_class, fn_ext",
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
index ba8b1a8a0679d..9a883aac69e6b 100644
--- a/pandas/tests/io/test_html.py
+++ b/pandas/tests/io/test_html.py
@@ -14,15 +14,7 @@
from pandas.errors import ParserError
import pandas.util._test_decorators as td
-from pandas import (
- DataFrame,
- MultiIndex,
- Series,
- Timestamp,
- date_range,
- read_csv,
- to_datetime,
-)
+from pandas import DataFrame, MultiIndex, Series, Timestamp, date_range, read_csv
import pandas._testing as tm
from pandas.io.common import file_path_to_url
@@ -618,7 +610,7 @@ def try_remove_ws(x):
gtnew = ground_truth.applymap(try_remove_ws)
converted = dfnew._convert(datetime=True, numeric=True)
date_cols = ["Closing Date", "Updated Date"]
- converted[date_cols] = converted[date_cols].apply(to_datetime)
+ converted[date_cols] = converted[date_cols]._convert(datetime=True, coerce=True)
tm.assert_frame_equal(converted, gtnew)
@pytest.mark.slow
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index fe3ca0d0937b3..3b83eed69c723 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -645,7 +645,7 @@ def test_s3_roundtrip_explicit_fs(self, df_compat, s3_resource, pa, s3so):
if LooseVersion(pyarrow.__version__) <= LooseVersion("0.17.0"):
pytest.skip()
s3 = s3fs.S3FileSystem(**s3so)
- kw = {"filesystem": s3}
+ kw = dict(filesystem=s3)
check_round_trip(
df_compat,
pa,
@@ -658,7 +658,7 @@ def test_s3_roundtrip(self, df_compat, s3_resource, pa, s3so):
if LooseVersion(pyarrow.__version__) <= LooseVersion("0.17.0"):
pytest.skip()
# GH #19134
- s3so = {"storage_options": s3so}
+ s3so = dict(storage_options=s3so)
check_round_trip(
df_compat,
pa,
@@ -710,12 +710,10 @@ def test_s3_roundtrip_for_dir(
pa,
expected=expected_df,
path="s3://pandas-test/parquet_dir",
- read_kwargs={"storage_options": s3so},
- write_kwargs={
- "partition_cols": partition_col,
- "compression": None,
- "storage_options": s3so,
- },
+ read_kwargs=dict(storage_options=s3so),
+ write_kwargs=dict(
+ partition_cols=partition_col, compression=None, storage_options=s3so
+ ),
check_like=True,
repeat=1,
)
@@ -830,35 +828,6 @@ def test_additional_extension_types(self, pa):
)
check_round_trip(df, pa)
- @td.skip_if_no("pyarrow", min_version="0.16")
- def test_use_nullable_dtypes(self, pa):
- import pyarrow.parquet as pq
-
- table = pyarrow.table(
- {
- "a": pyarrow.array([1, 2, 3, None], "int64"),
- "b": pyarrow.array([1, 2, 3, None], "uint8"),
- "c": pyarrow.array(["a", "b", "c", None]),
- "d": pyarrow.array([True, False, True, None]),
- }
- )
- with tm.ensure_clean() as path:
- # write manually with pyarrow to write integers
- pq.write_table(table, path)
- result1 = read_parquet(path)
- result2 = read_parquet(path, use_nullable_dtypes=True)
-
- assert result1["a"].dtype == np.dtype("float64")
- expected = pd.DataFrame(
- {
- "a": pd.array([1, 2, 3, None], dtype="Int64"),
- "b": pd.array([1, 2, 3, None], dtype="UInt8"),
- "c": pd.array(["a", "b", "c", None], dtype="string"),
- "d": pd.array([True, False, True, None], dtype="boolean"),
- }
- )
- tm.assert_frame_equal(result2, expected)
-
@td.skip_if_no("pyarrow", min_version="0.14")
def test_timestamp_nanoseconds(self, pa):
# with version 2.0, pyarrow defaults to writing the nanoseconds, so
@@ -948,8 +917,8 @@ def test_s3_roundtrip(self, df_compat, s3_resource, fp, s3so):
df_compat,
fp,
path="s3://pandas-test/fastparquet.parquet",
- read_kwargs={"storage_options": s3so},
- write_kwargs={"compression": None, "storage_options": s3so},
+ read_kwargs=dict(storage_options=s3so),
+ write_kwargs=dict(compression=None, storage_options=s3so),
)
def test_partition_cols_supported(self, fp, df_full):
@@ -1032,11 +1001,3 @@ def test_timezone_aware_index(self, fp, timezone_aware_date_list):
expected = df.copy()
expected.index.name = "index"
check_round_trip(df, fp, expected=expected)
-
- def test_use_nullable_dtypes_not_supported(self, fp):
- df = pd.DataFrame({"a": [1, 2]})
-
- with tm.ensure_clean() as path:
- df.to_parquet(path)
- with pytest.raises(ValueError, match="not supported for the fastparquet"):
- read_parquet(path, engine="fastparquet", use_nullable_dtypes=True)
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
index 24944281419c3..b065aa187f5fb 100644
--- a/pandas/tests/io/test_stata.py
+++ b/pandas/tests/io/test_stata.py
@@ -1974,12 +1974,12 @@ def test_iterator_value_labels():
df = DataFrame({f"col{k}": pd.Categorical(values, ordered=True) for k in range(2)})
with tm.ensure_clean() as path:
df.to_stata(path, write_index=False)
+ reader = pd.read_stata(path, chunksize=100)
expected = pd.Index(["a_label", "b_label", "c_label"], dtype="object")
- with pd.read_stata(path, chunksize=100) as reader:
- for j, chunk in enumerate(reader):
- for i in range(2):
- tm.assert_index_equal(chunk.dtypes[i].categories, expected)
- tm.assert_frame_equal(chunk, df.iloc[j * 100 : (j + 1) * 100])
+ for j, chunk in enumerate(reader):
+ for i in range(2):
+ tm.assert_index_equal(chunk.dtypes[i].categories, expected)
+ tm.assert_frame_equal(chunk, df.iloc[j * 100 : (j + 1) * 100])
def test_precision_loss():
diff --git a/pandas/tests/plotting/common.py b/pandas/tests/plotting/common.py
index 1f94e18d8e622..c868c8d4fba07 100644
--- a/pandas/tests/plotting/common.py
+++ b/pandas/tests/plotting/common.py
@@ -1,10 +1,3 @@
-"""
-Module consolidating common testing functions for checking plotting.
-
-Currently all plotting tests are marked as slow via
-``pytestmark = pytest.mark.slow`` at the module level.
-"""
-
import os
from typing import TYPE_CHECKING, Sequence, Union
import warnings
diff --git a/pandas/tests/plotting/frame/test_frame.py b/pandas/tests/plotting/frame/test_frame.py
index dc7478fe6ef4a..77a4c4a8faf5e 100644
--- a/pandas/tests/plotting/frame/test_frame.py
+++ b/pandas/tests/plotting/frame/test_frame.py
@@ -21,8 +21,6 @@
from pandas.io.formats.printing import pprint_thing
import pandas.plotting as plotting
-pytestmark = pytest.mark.slow
-
@td.skip_if_no_mpl
class TestDataFramePlots(TestPlotBase):
@@ -41,6 +39,7 @@ def setup_method(self, method):
}
)
+ @pytest.mark.slow
def test_plot(self):
from pandas.plotting._matplotlib.compat import mpl_ge_3_1_0
@@ -172,11 +171,13 @@ def test_nonnumeric_exclude(self):
ax = df.plot()
assert len(ax.get_lines()) == 1 # B was plotted
+ @pytest.mark.slow
def test_implicit_label(self):
df = DataFrame(np.random.randn(10, 3), columns=["a", "b", "c"])
ax = df.plot(x="a", y="b")
self._check_text_labels(ax.xaxis.get_label(), "a")
+ @pytest.mark.slow
def test_donot_overwrite_index_name(self):
# GH 8494
df = DataFrame(np.random.randn(2, 2), columns=["a", "b"])
@@ -184,6 +185,7 @@ def test_donot_overwrite_index_name(self):
df.plot(y="b", label="LABEL")
assert df.index.name == "NAME"
+ @pytest.mark.slow
def test_plot_xy(self):
# columns.inferred_type == 'string'
df = self.tdf
@@ -208,6 +210,7 @@ def test_plot_xy(self):
# columns.inferred_type == 'mixed'
# TODO add MultiIndex test
+ @pytest.mark.slow
@pytest.mark.parametrize(
"input_log, expected_log", [(True, "log"), ("sym", "symlog")]
)
@@ -236,6 +239,7 @@ def test_invalid_logscale(self, input_param):
with pytest.raises(ValueError, match=msg):
df.plot(**{input_param: "sm"})
+ @pytest.mark.slow
def test_xcompat(self):
import pandas as pd
@@ -456,28 +460,22 @@ def test_line_lim(self):
assert xmin <= lines[0].get_data()[0][0]
assert xmax >= lines[0].get_data()[0][-1]
- @pytest.mark.xfail(
- strict=False,
- reason="2020-12-01 this has been failing periodically on the "
- "ymin==0 assertion for a week or so.",
- )
- @pytest.mark.parametrize("stacked", [True, False])
- def test_area_lim(self, stacked):
+ def test_area_lim(self):
df = DataFrame(np.random.rand(6, 4), columns=["x", "y", "z", "four"])
neg_df = -df
+ for stacked in [True, False]:
+ ax = _check_plot_works(df.plot.area, stacked=stacked)
+ xmin, xmax = ax.get_xlim()
+ ymin, ymax = ax.get_ylim()
+ lines = ax.get_lines()
+ assert xmin <= lines[0].get_data()[0][0]
+ assert xmax >= lines[0].get_data()[0][-1]
+ assert ymin == 0
- ax = _check_plot_works(df.plot.area, stacked=stacked)
- xmin, xmax = ax.get_xlim()
- ymin, ymax = ax.get_ylim()
- lines = ax.get_lines()
- assert xmin <= lines[0].get_data()[0][0]
- assert xmax >= lines[0].get_data()[0][-1]
- assert ymin == 0
-
- ax = _check_plot_works(neg_df.plot.area, stacked=stacked)
- ymin, ymax = ax.get_ylim()
- assert ymax == 0
+ ax = _check_plot_works(neg_df.plot.area, stacked=stacked)
+ ymin, ymax = ax.get_ylim()
+ assert ymax == 0
def test_area_sharey_dont_overwrite(self):
# GH37942
@@ -490,6 +488,7 @@ def test_area_sharey_dont_overwrite(self):
assert ax1._shared_y_axes.joined(ax1, ax2)
assert ax2._shared_y_axes.joined(ax1, ax2)
+ @pytest.mark.slow
def test_bar_linewidth(self):
df = DataFrame(np.random.randn(5, 5))
@@ -510,6 +509,7 @@ def test_bar_linewidth(self):
for r in ax.patches:
assert r.get_linewidth() == 2
+ @pytest.mark.slow
def test_bar_barwidth(self):
df = DataFrame(np.random.randn(5, 5))
@@ -547,6 +547,7 @@ def test_bar_barwidth(self):
for r in ax.patches:
assert r.get_height() == width
+ @pytest.mark.slow
def test_bar_bottom_left(self):
df = DataFrame(np.random.rand(5, 5))
ax = df.plot.bar(stacked=False, bottom=1)
@@ -575,6 +576,7 @@ def test_bar_bottom_left(self):
result = [p.get_x() for p in ax.patches]
assert result == [1] * 5
+ @pytest.mark.slow
def test_bar_nan(self):
df = DataFrame({"A": [10, np.nan, 20], "B": [5, 10, 20], "C": [1, 2, 3]})
ax = df.plot.bar()
@@ -590,6 +592,7 @@ def test_bar_nan(self):
expected = [0.0, 0.0, 0.0, 10.0, 0.0, 20.0, 15.0, 10.0, 40.0]
assert result == expected
+ @pytest.mark.slow
def test_bar_categorical(self):
# GH 13019
df1 = DataFrame(
@@ -619,6 +622,7 @@ def test_bar_categorical(self):
assert ax.patches[0].get_x() == -0.25
assert ax.patches[-1].get_x() == 4.75
+ @pytest.mark.slow
def test_plot_scatter(self):
df = DataFrame(
np.random.randn(6, 4),
@@ -658,23 +662,25 @@ def test_scatterplot_datetime_data(self):
def test_scatterplot_object_data(self):
# GH 18755
- df = DataFrame({"a": ["A", "B", "C"], "b": [2, 3, 4]})
+ df = DataFrame(dict(a=["A", "B", "C"], b=[2, 3, 4]))
_check_plot_works(df.plot.scatter, x="a", y="b")
_check_plot_works(df.plot.scatter, x=0, y=1)
- df = DataFrame({"a": ["A", "B", "C"], "b": ["a", "b", "c"]})
+ df = DataFrame(dict(a=["A", "B", "C"], b=["a", "b", "c"]))
_check_plot_works(df.plot.scatter, x="a", y="b")
_check_plot_works(df.plot.scatter, x=0, y=1)
@pytest.mark.parametrize("x, y", [("x", "y"), ("y", "x"), ("y", "y")])
+ @pytest.mark.slow
def test_plot_scatter_with_categorical_data(self, x, y):
# after fixing GH 18755, should be able to plot categorical data
df = DataFrame({"x": [1, 2, 3, 4], "y": pd.Categorical(["a", "b", "a", "c"])})
_check_plot_works(df.plot.scatter, x=x, y=y)
+ @pytest.mark.slow
def test_plot_scatter_with_c(self):
df = DataFrame(
np.random.randn(6, 4),
@@ -733,6 +739,7 @@ def test_plot_scatter_with_s(self):
ax = df.plot.scatter(x="a", y="b", s="c")
tm.assert_numpy_array_equal(df["c"].values, right=ax.collections[0].get_sizes())
+ @pytest.mark.slow
def test_plot_bar(self):
df = DataFrame(
np.random.randn(6, 4),
@@ -765,6 +772,7 @@ def test_plot_bar(self):
ax = df.plot.barh(rot=55, fontsize=11)
self._check_ticks_props(ax, yrot=55, ylabelsize=11, xlabelsize=11)
+ @pytest.mark.slow
def test_boxplot(self):
df = self.hist_df
series = df["height"]
@@ -793,6 +801,7 @@ def test_boxplot(self):
tm.assert_numpy_array_equal(ax.xaxis.get_ticklocs(), positions)
assert len(ax.lines) == self.bp_n_objects * len(numeric_cols)
+ @pytest.mark.slow
def test_boxplot_vertical(self):
df = self.hist_df
numeric_cols = df._get_numeric_data().columns
@@ -823,6 +832,7 @@ def test_boxplot_vertical(self):
tm.assert_numpy_array_equal(ax.yaxis.get_ticklocs(), positions)
assert len(ax.lines) == self.bp_n_objects * len(numeric_cols)
+ @pytest.mark.slow
def test_boxplot_return_type(self):
df = DataFrame(
np.random.randn(6, 4),
@@ -844,6 +854,7 @@ def test_boxplot_return_type(self):
result = df.plot.box(return_type="both")
self._check_box_return_type(result, "both")
+ @pytest.mark.slow
@td.skip_if_no_scipy
def test_kde_df(self):
df = DataFrame(np.random.randn(100, 4))
@@ -866,12 +877,14 @@ def test_kde_df(self):
axes = df.plot(kind="kde", logy=True, subplots=True)
self._check_ax_scales(axes, yaxis="log")
+ @pytest.mark.slow
@td.skip_if_no_scipy
def test_kde_missing_vals(self):
df = DataFrame(np.random.uniform(size=(100, 4)))
df.loc[0, 0] = np.nan
_check_plot_works(df.plot, kind="kde")
+ @pytest.mark.slow
def test_hist_df(self):
from matplotlib.patches import Rectangle
@@ -953,6 +966,7 @@ def _check_box_coord(
if expected_w is not None:
tm.assert_numpy_array_equal(result_width, expected_w, check_dtype=False)
+ @pytest.mark.slow
def test_hist_df_coord(self):
normal_df = DataFrame(
{
@@ -1084,10 +1098,12 @@ def test_hist_df_coord(self):
expected_w=np.array([6, 7, 8, 9, 10]),
)
+ @pytest.mark.slow
def test_plot_int_columns(self):
df = DataFrame(np.random.randn(100, 4)).cumsum()
_check_plot_works(df.plot, legend=True)
+ @pytest.mark.slow
def test_df_legend_labels(self):
kinds = ["line", "bar", "barh", "kde", "area", "hist"]
df = DataFrame(np.random.rand(3, 3), columns=["a", "b", "c"])
@@ -1201,6 +1217,7 @@ def test_legend_name(self):
leg_title = ax.legend_.get_title()
self._check_text_labels(leg_title, "new")
+ @pytest.mark.slow
def test_no_legend(self):
kinds = ["line", "bar", "barh", "kde", "area", "hist"]
df = DataFrame(np.random.rand(3, 3), columns=["a", "b", "c"])
@@ -1209,6 +1226,7 @@ def test_no_legend(self):
ax = df.plot(kind=kind, legend=False)
self._check_legend_labels(ax, visible=False)
+ @pytest.mark.slow
def test_style_by_column(self):
import matplotlib.pyplot as plt
@@ -1227,6 +1245,7 @@ def test_style_by_column(self):
for idx, line in enumerate(ax.get_lines()[: len(markers)]):
assert line.get_marker() == markers[idx]
+ @pytest.mark.slow
def test_line_label_none(self):
s = Series([1, 2])
ax = s.plot()
@@ -1247,7 +1266,7 @@ def test_line_label_none(self):
def test_specified_props_kwd_plot_box(self, props, expected):
# GH 30346
df = DataFrame({k: np.random.random(100) for k in "ABC"})
- kwd = {props: {"color": "C1"}}
+ kwd = {props: dict(color="C1")}
result = df.plot.box(return_type="dict", **kwd)
assert result[expected][0].get_color() == "C1"
@@ -1283,6 +1302,7 @@ def test_all_invalid_plot_data(self):
with pytest.raises(TypeError, match=msg):
df.plot(kind=kind)
+ @pytest.mark.slow
def test_partially_invalid_plot_data(self):
with tm.RNGContext(42):
df = DataFrame(np.random.randn(10, 2), dtype=object)
@@ -1352,6 +1372,7 @@ def test_xy_args_integer(self, x, y, colnames):
df.columns = colnames
_check_plot_works(df.plot, x=x, y=y)
+ @pytest.mark.slow
def test_hexbin_basic(self):
df = self.hexbin_df
@@ -1367,6 +1388,7 @@ def test_hexbin_basic(self):
# return value is single axes
self._check_axes_shape(axes, axes_num=1, layout=(1, 1))
+ @pytest.mark.slow
def test_hexbin_with_c(self):
df = self.hexbin_df
@@ -1376,6 +1398,7 @@ def test_hexbin_with_c(self):
ax = df.plot.hexbin(x="A", y="B", C="C", reduce_C_function=np.std)
assert len(ax.collections) == 1
+ @pytest.mark.slow
@pytest.mark.parametrize(
"kwargs, expected",
[
@@ -1389,6 +1412,7 @@ def test_hexbin_cmap(self, kwargs, expected):
ax = df.plot.hexbin(x="A", y="B", **kwargs)
assert ax.collections[0].cmap.name == expected
+ @pytest.mark.slow
def test_pie_df(self):
df = DataFrame(
np.random.rand(5, 3),
@@ -1460,6 +1484,7 @@ def test_pie_df_nan(self):
expected_labels = base_expected[:i] + base_expected[i + 1 :]
assert result_labels == expected_labels
+ @pytest.mark.slow
def test_errorbar_plot(self):
d = {"x": np.arange(12), "y": np.arange(12, 0, -1)}
df = DataFrame(d)
@@ -1506,6 +1531,7 @@ def test_errorbar_plot(self):
with pytest.raises((ValueError, TypeError)):
df.plot(yerr=df_err)
+ @pytest.mark.slow
@pytest.mark.parametrize("kind", ["line", "bar", "barh"])
def test_errorbar_plot_different_kinds(self, kind):
d = {"x": np.arange(12), "y": np.arange(12, 0, -1)}
@@ -1539,6 +1565,7 @@ def test_errorbar_plot_different_kinds(self, kind):
self._check_has_errorbars(axes, xerr=1, yerr=1)
@pytest.mark.xfail(reason="Iterator is consumed", raises=ValueError)
+ @pytest.mark.slow
def test_errorbar_plot_iterator(self):
with warnings.catch_warnings():
d = {"x": np.arange(12), "y": np.arange(12, 0, -1)}
@@ -1548,6 +1575,7 @@ def test_errorbar_plot_iterator(self):
ax = _check_plot_works(df.plot, yerr=itertools.repeat(0.1, len(df)))
self._check_has_errorbars(ax, xerr=0, yerr=2)
+ @pytest.mark.slow
def test_errorbar_with_integer_column_names(self):
# test with integer column names
df = DataFrame(np.random.randn(10, 2))
@@ -1557,6 +1585,7 @@ def test_errorbar_with_integer_column_names(self):
ax = _check_plot_works(df.plot, y=0, yerr=1)
self._check_has_errorbars(ax, xerr=0, yerr=1)
+ @pytest.mark.slow
def test_errorbar_with_partial_columns(self):
df = DataFrame(np.random.randn(10, 3))
df_err = DataFrame(np.random.randn(10, 2), columns=[0, 2])
@@ -1579,6 +1608,7 @@ def test_errorbar_with_partial_columns(self):
ax = _check_plot_works(df.plot, yerr=err)
self._check_has_errorbars(ax, xerr=0, yerr=1)
+ @pytest.mark.slow
@pytest.mark.parametrize("kind", ["line", "bar", "barh"])
def test_errorbar_timeseries(self, kind):
d = {"x": np.arange(12), "y": np.arange(12, 0, -1)}
@@ -1683,6 +1713,7 @@ def _check_errorbar_color(containers, expected, has_err="has_xerr"):
self._check_has_errorbars(ax, xerr=0, yerr=1)
_check_errorbar_color(ax.containers, "green", has_err="has_yerr")
+ @pytest.mark.slow
def test_sharex_and_ax(self):
# https://github.com/pandas-dev/pandas/issues/9737 using gridspec,
# the axis in fig.get_axis() are sorted differently than pandas
@@ -1737,6 +1768,7 @@ def _check(axes):
self._check_visible(ax.get_xticklabels(minor=True), visible=True)
tm.close()
+ @pytest.mark.slow
def test_sharey_and_ax(self):
# https://github.com/pandas-dev/pandas/issues/9737 using gridspec,
# the axis in fig.get_axis() are sorted differently than pandas
@@ -1822,6 +1854,7 @@ def test_memory_leak(self):
# need to actually access something to get an error
results[key].lines
+ @pytest.mark.slow
def test_df_gridspec_patterns(self):
# GH 10819
import matplotlib.gridspec as gridspec
@@ -1937,6 +1970,7 @@ def _get_boxed_grid():
self._check_visible(ax.get_xticklabels(minor=True), visible=True)
tm.close()
+ @pytest.mark.slow
def test_df_grid_settings(self):
# Make sure plot defaults to rcParams['axes.grid'] setting, GH 9792
self._check_grid_settings(
@@ -1988,10 +2022,11 @@ def test_secondary_axis_font_size(self, method):
fontsize = 20
sy = ["C", "D"]
- kwargs = {"secondary_y": sy, "fontsize": fontsize, "mark_right": True}
+ kwargs = dict(secondary_y=sy, fontsize=fontsize, mark_right=True)
ax = getattr(df.plot, method)(**kwargs)
self._check_ticks_props(axes=ax.right_ax, ylabelsize=fontsize)
+ @pytest.mark.slow
def test_x_string_values_ticks(self):
# Test if string plot index has a fixed xtick position
# GH: 7612, GH: 22334
@@ -2011,6 +2046,7 @@ def test_x_string_values_ticks(self):
assert labels_position["Tuesday"] == 1.0
assert labels_position["Wednesday"] == 2.0
+ @pytest.mark.slow
def test_x_multiindex_values_ticks(self):
# Test if multiindex plot index has a fixed xtick position
# GH: 15912
@@ -2154,6 +2190,7 @@ def test_xlabel_ylabel_dataframe_plane_plot(self, kind, xlabel, ylabel):
assert ax.get_xlabel() == (xcol if xlabel is None else xlabel)
assert ax.get_ylabel() == (ycol if ylabel is None else ylabel)
+ @pytest.mark.slow
@pytest.mark.parametrize("method", ["bar", "barh"])
def test_bar_ticklabel_consistence(self, method):
# Draw two consecutive bar plots with consistent ticklabels
diff --git a/pandas/tests/plotting/frame/test_frame_color.py b/pandas/tests/plotting/frame/test_frame_color.py
index bc64014cdb6d4..d9fe7363a15ad 100644
--- a/pandas/tests/plotting/frame/test_frame_color.py
+++ b/pandas/tests/plotting/frame/test_frame_color.py
@@ -12,8 +12,6 @@
import pandas._testing as tm
from pandas.tests.plotting.common import TestPlotBase, _check_plot_works
-pytestmark = pytest.mark.slow
-
@td.skip_if_no_mpl
class TestDataFrameColor(TestPlotBase):
@@ -100,6 +98,7 @@ def test_color_and_marker(self, color, expected):
assert all(i.get_linestyle() == "--" for i in ax.lines)
assert all(i.get_marker() == "d" for i in ax.lines)
+ @pytest.mark.slow
def test_bar_colors(self):
import matplotlib.pyplot as plt
@@ -153,6 +152,7 @@ def test_bar_user_colors(self):
]
assert result == expected
+ @pytest.mark.slow
def test_if_scatterplot_colorbar_affects_xaxis_visibility(self):
# addressing issue #10611, to ensure colorbar does not
# interfere with x-axis label and ticklabels with
@@ -175,6 +175,7 @@ def test_if_scatterplot_colorbar_affects_xaxis_visibility(self):
ax1.xaxis.get_label().get_visible() == ax2.xaxis.get_label().get_visible()
)
+ @pytest.mark.slow
def test_if_hexbin_xaxis_label_is_visible(self):
        # addressing issue #10678, to ensure colorbar does not
# interfere with x-axis label and ticklabels with
@@ -187,6 +188,7 @@ def test_if_hexbin_xaxis_label_is_visible(self):
assert all(vis.get_visible() for vis in ax.xaxis.get_majorticklabels())
assert ax.xaxis.get_label().get_visible()
+ @pytest.mark.slow
def test_if_scatterplot_colorbars_are_next_to_parent_axes(self):
import matplotlib.pyplot as plt
@@ -248,6 +250,7 @@ def test_scatter_colorbar_different_cmap(self):
assert ax.collections[0].cmap.name == "cividis"
assert ax.collections[1].cmap.name == "magma"
+ @pytest.mark.slow
def test_line_colors(self):
from matplotlib import cm
@@ -292,11 +295,13 @@ def test_line_colors(self):
self._check_colors(ax.get_lines(), linecolors=custom_colors)
tm.close()
+ @pytest.mark.slow
def test_dont_modify_colors(self):
colors = ["r", "g", "b"]
DataFrame(np.random.rand(10, 2)).plot(color=colors)
assert len(colors) == 3
+ @pytest.mark.slow
def test_line_colors_and_styles_subplots(self):
# GH 9894
from matplotlib import cm
@@ -365,6 +370,7 @@ def test_line_colors_and_styles_subplots(self):
self._check_colors(ax.get_lines(), linecolors=[c])
tm.close()
+ @pytest.mark.slow
def test_area_colors(self):
from matplotlib import cm
from matplotlib.collections import PolyCollection
@@ -409,6 +415,7 @@ def test_area_colors(self):
for h in handles:
assert h.get_alpha() == 0.5
+ @pytest.mark.slow
def test_hist_colors(self):
default_colors = self._unpack_cycler(self.plt.rcParams)
@@ -443,6 +450,7 @@ def test_hist_colors(self):
self._check_colors(ax.patches[::10], facecolors=["green"] * 5)
tm.close()
+ @pytest.mark.slow
@td.skip_if_no_scipy
def test_kde_colors(self):
from matplotlib import cm
@@ -463,6 +471,7 @@ def test_kde_colors(self):
rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, len(df))]
self._check_colors(ax.get_lines(), linecolors=rgba_colors)
+ @pytest.mark.slow
@td.skip_if_no_scipy
def test_kde_colors_and_styles_subplots(self):
from matplotlib import cm
@@ -519,6 +528,7 @@ def test_kde_colors_and_styles_subplots(self):
self._check_colors(ax.get_lines(), linecolors=[c])
tm.close()
+ @pytest.mark.slow
def test_boxplot_colors(self):
def _check_colors(bp, box_c, whiskers_c, medians_c, caps_c="k", fliers_c=None):
# TODO: outside this func?
@@ -541,12 +551,9 @@ def _check_colors(bp, box_c, whiskers_c, medians_c, caps_c="k", fliers_c=None):
_check_colors(bp, default_colors[0], default_colors[0], default_colors[2])
tm.close()
- dict_colors = {
- "boxes": "#572923",
- "whiskers": "#982042",
- "medians": "#804823",
- "caps": "#123456",
- }
+ dict_colors = dict(
+ boxes="#572923", whiskers="#982042", medians="#804823", caps="#123456"
+ )
bp = df.plot.box(color=dict_colors, sym="r+", return_type="dict")
_check_colors(
bp,
@@ -559,7 +566,7 @@ def _check_colors(bp, box_c, whiskers_c, medians_c, caps_c="k", fliers_c=None):
tm.close()
# partial colors
- dict_colors = {"whiskers": "c", "medians": "m"}
+ dict_colors = dict(whiskers="c", medians="m")
bp = df.plot.box(color=dict_colors, return_type="dict")
_check_colors(bp, default_colors[0], "c", "m")
tm.close()
@@ -587,7 +594,7 @@ def _check_colors(bp, box_c, whiskers_c, medians_c, caps_c="k", fliers_c=None):
with pytest.raises(ValueError):
# Color contains invalid key results in ValueError
- df.plot.box(color={"boxes": "red", "xxxx": "blue"})
+ df.plot.box(color=dict(boxes="red", xxxx="blue"))
def test_default_color_cycle(self):
import cycler
@@ -602,11 +609,13 @@ def test_default_color_cycle(self):
expected = self._unpack_cycler(plt.rcParams)[:3]
self._check_colors(ax.get_lines(), linecolors=expected)
+ @pytest.mark.slow
def test_no_color_bar(self):
df = self.hexbin_df
ax = df.plot.hexbin(x="A", y="B", colorbar=None)
assert ax.collections[0].colorbar is None
+ @pytest.mark.slow
def test_mixing_cmap_and_colormap_raises(self):
df = self.hexbin_df
msg = "Only specify one of `cmap` and `colormap`"
diff --git a/pandas/tests/plotting/frame/test_frame_groupby.py b/pandas/tests/plotting/frame/test_frame_groupby.py
index bc35e02e6a581..9c1676d6d97fb 100644
--- a/pandas/tests/plotting/frame/test_frame_groupby.py
+++ b/pandas/tests/plotting/frame/test_frame_groupby.py
@@ -9,8 +9,6 @@
import pandas._testing as tm
from pandas.tests.plotting.common import TestPlotBase
-pytestmark = pytest.mark.slow
-
@td.skip_if_no_mpl
class TestDataFramePlotsGroupby(TestPlotBase):
diff --git a/pandas/tests/plotting/frame/test_frame_subplots.py b/pandas/tests/plotting/frame/test_frame_subplots.py
index 427b2c1c3a180..413c5b8a87dc7 100644
--- a/pandas/tests/plotting/frame/test_frame_subplots.py
+++ b/pandas/tests/plotting/frame/test_frame_subplots.py
@@ -15,8 +15,6 @@
from pandas.io.formats.printing import pprint_thing
-pytestmark = pytest.mark.slow
-
@td.skip_if_no_mpl
class TestDataFramePlotsSubplots(TestPlotBase):
@@ -35,6 +33,7 @@ def setup_method(self, method):
}
)
+ @pytest.mark.slow
def test_subplots(self):
df = DataFrame(np.random.rand(10, 3), index=list(string.ascii_letters[:10]))
@@ -73,6 +72,7 @@ def test_subplots(self):
for ax in axes:
assert ax.get_legend() is None
+ @pytest.mark.slow
def test_subplots_timeseries(self):
idx = date_range(start="2014-07-01", freq="M", periods=10)
df = DataFrame(np.random.rand(10, 3), index=idx)
@@ -190,6 +190,7 @@ def test_subplots_timeseries_y_axis_not_supported(self):
== testdata["datetime_mixed_tz"].values
).all()
+ @pytest.mark.slow
def test_subplots_layout_multi_column(self):
# GH 6667
df = DataFrame(np.random.rand(10, 3), index=list(string.ascii_letters[:10]))
@@ -223,6 +224,7 @@ def test_subplots_layout_multi_column(self):
with pytest.raises(ValueError):
df.plot(subplots=True, layout=(-1, -1))
+ @pytest.mark.slow
@pytest.mark.parametrize(
"kwargs, expected_axes_num, expected_layout, expected_shape",
[
@@ -244,6 +246,7 @@ def test_subplots_layout_single_column(
)
assert axes.shape == expected_shape
+ @pytest.mark.slow
def test_subplots_warnings(self):
# GH 9464
with tm.assert_produces_warning(None):
@@ -255,6 +258,7 @@ def test_subplots_warnings(self):
)
df.plot(subplots=True, layout=(3, 2))
+ @pytest.mark.slow
def test_subplots_multiple_axes(self):
# GH 5353, 6970, GH 7069
fig, axes = self.plt.subplots(2, 3)
@@ -354,6 +358,7 @@ def test_subplots_sharex_axes_existing_axes(self):
for ax in axes.ravel():
self._check_visible(ax.get_yticklabels(), visible=True)
+ @pytest.mark.slow
def test_subplots_dup_columns(self):
# GH 10962
df = DataFrame(np.random.rand(5, 5), columns=list("aaaaa"))
@@ -375,6 +380,7 @@ def test_subplots_dup_columns(self):
assert len(ax.lines) == 0
assert len(ax.right_ax.lines) == 5
+ @pytest.mark.slow
def test_bar_log_no_subplots(self):
# GH3254, GH3298 matplotlib/matplotlib#1882, #1892
# regressions in 1.2.1
@@ -385,6 +391,7 @@ def test_bar_log_no_subplots(self):
ax = df.plot.bar(grid=True, log=True)
tm.assert_numpy_array_equal(ax.yaxis.get_ticklocs(), expected)
+ @pytest.mark.slow
def test_bar_log_subplots(self):
expected = np.array([0.1, 1.0, 10.0, 100.0, 1000.0, 1e4])
@@ -395,6 +402,7 @@ def test_bar_log_subplots(self):
tm.assert_numpy_array_equal(ax[0].yaxis.get_ticklocs(), expected)
tm.assert_numpy_array_equal(ax[1].yaxis.get_ticklocs(), expected)
+ @pytest.mark.slow
def test_boxplot_subplots_return_type(self):
df = self.hist_df
@@ -414,6 +422,7 @@ def test_boxplot_subplots_return_type(self):
check_ax_title=False,
)
+ @pytest.mark.slow
def test_df_subplots_patterns_minorticks(self):
# GH 10657
import matplotlib.pyplot as plt
@@ -504,37 +513,38 @@ def test_xlabel_ylabel_dataframe_subplots(
assert all(ax.get_ylabel() == str(new_label) for ax in axes)
assert all(ax.get_xlabel() == str(new_label) for ax in axes)
+ @pytest.mark.slow
@pytest.mark.parametrize(
"kwargs",
[
# stacked center
- {"kind": "bar", "stacked": True},
- {"kind": "bar", "stacked": True, "width": 0.9},
- {"kind": "barh", "stacked": True},
- {"kind": "barh", "stacked": True, "width": 0.9},
+ dict(kind="bar", stacked=True),
+ dict(kind="bar", stacked=True, width=0.9),
+ dict(kind="barh", stacked=True),
+ dict(kind="barh", stacked=True, width=0.9),
# center
- {"kind": "bar", "stacked": False},
- {"kind": "bar", "stacked": False, "width": 0.9},
- {"kind": "barh", "stacked": False},
- {"kind": "barh", "stacked": False, "width": 0.9},
+ dict(kind="bar", stacked=False),
+ dict(kind="bar", stacked=False, width=0.9),
+ dict(kind="barh", stacked=False),
+ dict(kind="barh", stacked=False, width=0.9),
# subplots center
- {"kind": "bar", "subplots": True},
- {"kind": "bar", "subplots": True, "width": 0.9},
- {"kind": "barh", "subplots": True},
- {"kind": "barh", "subplots": True, "width": 0.9},
+ dict(kind="bar", subplots=True),
+ dict(kind="bar", subplots=True, width=0.9),
+ dict(kind="barh", subplots=True),
+ dict(kind="barh", subplots=True, width=0.9),
# align edge
- {"kind": "bar", "stacked": True, "align": "edge"},
- {"kind": "bar", "stacked": True, "width": 0.9, "align": "edge"},
- {"kind": "barh", "stacked": True, "align": "edge"},
- {"kind": "barh", "stacked": True, "width": 0.9, "align": "edge"},
- {"kind": "bar", "stacked": False, "align": "edge"},
- {"kind": "bar", "stacked": False, "width": 0.9, "align": "edge"},
- {"kind": "barh", "stacked": False, "align": "edge"},
- {"kind": "barh", "stacked": False, "width": 0.9, "align": "edge"},
- {"kind": "bar", "subplots": True, "align": "edge"},
- {"kind": "bar", "subplots": True, "width": 0.9, "align": "edge"},
- {"kind": "barh", "subplots": True, "align": "edge"},
- {"kind": "barh", "subplots": True, "width": 0.9, "align": "edge"},
+ dict(kind="bar", stacked=True, align="edge"),
+ dict(kind="bar", stacked=True, width=0.9, align="edge"),
+ dict(kind="barh", stacked=True, align="edge"),
+ dict(kind="barh", stacked=True, width=0.9, align="edge"),
+ dict(kind="bar", stacked=False, align="edge"),
+ dict(kind="bar", stacked=False, width=0.9, align="edge"),
+ dict(kind="barh", stacked=False, align="edge"),
+ dict(kind="barh", stacked=False, width=0.9, align="edge"),
+ dict(kind="bar", subplots=True, align="edge"),
+ dict(kind="bar", subplots=True, width=0.9, align="edge"),
+ dict(kind="barh", subplots=True, align="edge"),
+ dict(kind="barh", subplots=True, width=0.9, align="edge"),
],
)
def test_bar_align_multiple_columns(self, kwargs):
@@ -542,21 +552,23 @@ def test_bar_align_multiple_columns(self, kwargs):
df = DataFrame({"A": [3] * 5, "B": list(range(5))}, index=range(5))
self._check_bar_alignment(df, **kwargs)
+ @pytest.mark.slow
@pytest.mark.parametrize(
"kwargs",
[
- {"kind": "bar", "stacked": False},
- {"kind": "bar", "stacked": True},
- {"kind": "barh", "stacked": False},
- {"kind": "barh", "stacked": True},
- {"kind": "bar", "subplots": True},
- {"kind": "barh", "subplots": True},
+ dict(kind="bar", stacked=False),
+ dict(kind="bar", stacked=True),
+ dict(kind="barh", stacked=False),
+ dict(kind="barh", stacked=True),
+ dict(kind="bar", subplots=True),
+ dict(kind="barh", subplots=True),
],
)
def test_bar_align_single_column(self, kwargs):
df = DataFrame(np.random.randn(5))
self._check_bar_alignment(df, **kwargs)
+ @pytest.mark.slow
@pytest.mark.parametrize(
"kwargs",
[
@@ -572,6 +584,7 @@ def test_bar_barwidth_position(self, kwargs):
df = DataFrame(np.random.randn(5, 5))
self._check_bar_alignment(df, width=0.9, position=0.2, **kwargs)
+ @pytest.mark.slow
def test_bar_barwidth_position_int(self):
# GH 12979
df = DataFrame(np.random.randn(5, 5))
diff --git a/pandas/tests/plotting/test_backend.py b/pandas/tests/plotting/test_backend.py
index 567d159f723a5..9025f8c361a82 100644
--- a/pandas/tests/plotting/test_backend.py
+++ b/pandas/tests/plotting/test_backend.py
@@ -12,9 +12,6 @@
setattr(dummy_backend, "plot", lambda *args, **kwargs: "used_dummy")
-pytestmark = pytest.mark.slow
-
-
@pytest.fixture
def restore_backend():
"""Restore the plotting backend to matplotlib"""
diff --git a/pandas/tests/plotting/test_boxplot_method.py b/pandas/tests/plotting/test_boxplot_method.py
index 54a40afd019c3..9e1a8d473b9d6 100644
--- a/pandas/tests/plotting/test_boxplot_method.py
+++ b/pandas/tests/plotting/test_boxplot_method.py
@@ -1,5 +1,3 @@
-""" Test cases for .boxplot method """
-
import itertools
import string
@@ -14,11 +12,12 @@
import pandas.plotting as plotting
-pytestmark = pytest.mark.slow
+""" Test cases for .boxplot method """
@td.skip_if_no_mpl
class TestDataFramePlots(TestPlotBase):
+ @pytest.mark.slow
def test_boxplot_legacy1(self):
df = DataFrame(
np.random.randn(6, 4),
@@ -43,6 +42,7 @@ def test_boxplot_legacy1(self):
with tm.assert_produces_warning(UserWarning):
_check_plot_works(df.boxplot, by="indic", notch=1)
+ @pytest.mark.slow
def test_boxplot_legacy2(self):
df = DataFrame(np.random.rand(10, 2), columns=["Col1", "Col2"])
df["X"] = Series(["A", "A", "A", "A", "A", "B", "B", "B", "B", "B"])
@@ -77,11 +77,13 @@ def test_boxplot_legacy2(self):
lines = list(itertools.chain.from_iterable(d.values()))
assert len(ax.get_lines()) == len(lines)
+ @pytest.mark.slow
def test_boxplot_return_type_none(self):
# GH 12216; return_type=None & by=None -> axes
result = self.hist_df.boxplot()
assert isinstance(result, self.plt.Axes)
+ @pytest.mark.slow
def test_boxplot_return_type_legacy(self):
# API change in https://github.com/pandas-dev/pandas/pull/7096
import matplotlib as mpl # noqa
@@ -109,6 +111,7 @@ def test_boxplot_return_type_legacy(self):
result = df.boxplot(return_type="both")
self._check_box_return_type(result, "both")
+ @pytest.mark.slow
def test_boxplot_axis_limits(self):
def _check_ax_limits(col, ax):
y_min, y_max = ax.get_ylim()
@@ -135,11 +138,13 @@ def _check_ax_limits(col, ax):
assert age_ax._sharey == height_ax
assert dummy_ax._sharey is None
+ @pytest.mark.slow
def test_boxplot_empty_column(self):
df = DataFrame(np.random.randn(20, 4))
df.loc[:, 0] = np.nan
_check_plot_works(df.boxplot, return_type="axes")
+ @pytest.mark.slow
def test_figsize(self):
df = DataFrame(np.random.rand(10, 5), columns=["A", "B", "C", "D", "E"])
result = df.boxplot(return_type="axes", figsize=(12, 8))
@@ -171,11 +176,11 @@ def test_boxplot_numeric_data(self):
"colors_kwd, expected",
[
(
- {"boxes": "r", "whiskers": "b", "medians": "g", "caps": "c"},
- {"boxes": "r", "whiskers": "b", "medians": "g", "caps": "c"},
+ dict(boxes="r", whiskers="b", medians="g", caps="c"),
+ dict(boxes="r", whiskers="b", medians="g", caps="c"),
),
- ({"boxes": "r"}, {"boxes": "r"}),
- ("r", {"boxes": "r", "whiskers": "r", "medians": "r", "caps": "r"}),
+ (dict(boxes="r"), dict(boxes="r")),
+ ("r", dict(boxes="r", whiskers="r", medians="r", caps="r")),
],
)
def test_color_kwd(self, colors_kwd, expected):
@@ -187,7 +192,7 @@ def test_color_kwd(self, colors_kwd, expected):
@pytest.mark.parametrize(
"dict_colors, msg",
- [({"boxes": "r", "invalid_key": "r"}, "invalid key 'invalid_key'")],
+ [(dict(boxes="r", invalid_key="r"), "invalid key 'invalid_key'")],
)
def test_color_kwd_errors(self, dict_colors, msg):
# GH: 26214
@@ -207,7 +212,7 @@ def test_color_kwd_errors(self, dict_colors, msg):
def test_specified_props_kwd(self, props, expected):
# GH 30346
df = DataFrame({k: np.random.random(100) for k in "ABC"})
- kwd = {props: {"color": "C1"}}
+ kwd = {props: dict(color="C1")}
result = df.boxplot(return_type="dict", **kwd)
assert result[expected][0].get_color() == "C1"
@@ -215,6 +220,7 @@ def test_specified_props_kwd(self, props, expected):
@td.skip_if_no_mpl
class TestDataFrameGroupByPlots(TestPlotBase):
+ @pytest.mark.slow
def test_boxplot_legacy1(self):
grouped = self.hist_df.groupby(by="gender")
with tm.assert_produces_warning(UserWarning):
@@ -223,6 +229,7 @@ def test_boxplot_legacy1(self):
axes = _check_plot_works(grouped.boxplot, subplots=False, return_type="axes")
self._check_axes_shape(axes, axes_num=1, layout=(1, 1))
+ @pytest.mark.slow
def test_boxplot_legacy2(self):
tuples = zip(string.ascii_letters[:10], range(10))
df = DataFrame(np.random.rand(10, 3), index=MultiIndex.from_tuples(tuples))
@@ -234,6 +241,7 @@ def test_boxplot_legacy2(self):
axes = _check_plot_works(grouped.boxplot, subplots=False, return_type="axes")
self._check_axes_shape(axes, axes_num=1, layout=(1, 1))
+ @pytest.mark.slow
def test_boxplot_legacy3(self):
tuples = zip(string.ascii_letters[:10], range(10))
df = DataFrame(np.random.rand(10, 3), index=MultiIndex.from_tuples(tuples))
@@ -244,6 +252,7 @@ def test_boxplot_legacy3(self):
axes = _check_plot_works(grouped.boxplot, subplots=False, return_type="axes")
self._check_axes_shape(axes, axes_num=1, layout=(1, 1))
+ @pytest.mark.slow
def test_grouped_plot_fignums(self):
n = 10
weight = Series(np.random.normal(166, 20, size=n))
@@ -267,6 +276,7 @@ def test_grouped_plot_fignums(self):
res = df.groupby("gender").hist()
tm.close()
+ @pytest.mark.slow
def test_grouped_box_return_type(self):
df = self.hist_df
@@ -301,6 +311,7 @@ def test_grouped_box_return_type(self):
returned = df2.boxplot(by="category", return_type=t)
self._check_box_return_type(returned, t, expected_keys=columns2)
+ @pytest.mark.slow
def test_grouped_box_layout(self):
df = self.hist_df
@@ -394,6 +405,7 @@ def test_grouped_box_layout(self):
)
self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(1, 3))
+ @pytest.mark.slow
def test_grouped_box_multiple_axes(self):
# GH 6970, GH 7069
df = self.hist_df
diff --git a/pandas/tests/plotting/test_common.py b/pandas/tests/plotting/test_common.py
index 2664dc8e1b090..af67ed7ec215b 100644
--- a/pandas/tests/plotting/test_common.py
+++ b/pandas/tests/plotting/test_common.py
@@ -5,8 +5,6 @@
from pandas import DataFrame
from pandas.tests.plotting.common import TestPlotBase, _check_plot_works
-pytestmark = pytest.mark.slow
-
@td.skip_if_no_mpl
class TestCommon(TestPlotBase):
diff --git a/pandas/tests/plotting/test_converter.py b/pandas/tests/plotting/test_converter.py
index ae14318cdaa49..583ed040c20d5 100644
--- a/pandas/tests/plotting/test_converter.py
+++ b/pandas/tests/plotting/test_converter.py
@@ -31,9 +31,6 @@
dates = pytest.importorskip("matplotlib.dates")
-pytestmark = pytest.mark.slow
-
-
def test_registry_mpl_resets():
# Check that Matplotlib converters are properly reset (see issue #27481)
code = (
diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py
index 397a064f6adad..590758bc01fbb 100644
--- a/pandas/tests/plotting/test_datetimelike.py
+++ b/pandas/tests/plotting/test_datetimelike.py
@@ -18,8 +18,6 @@
from pandas.tseries.offsets import WeekOfMonth
-pytestmark = pytest.mark.slow
-
@td.skip_if_no_mpl
class TestTSPlot(TestPlotBase):
@@ -45,6 +43,7 @@ def setup_method(self, method):
def teardown_method(self, method):
tm.close()
+ @pytest.mark.slow
def test_ts_plot_with_tz(self, tz_aware_fixture):
# GH2877, GH17173, GH31205, GH31580
tz = tz_aware_fixture
@@ -66,6 +65,7 @@ def test_fontsize_set_correctly(self):
for label in ax.get_xticklabels() + ax.get_yticklabels():
assert label.get_fontsize() == 2
+ @pytest.mark.slow
def test_frame_inferred(self):
# inferred freq
idx = date_range("1/1/1987", freq="MS", periods=100)
@@ -105,6 +105,7 @@ def test_nonnumeric_exclude(self):
with pytest.raises(TypeError, match=msg):
df["A"].plot()
+ @pytest.mark.slow
def test_tsplot(self):
_, ax = self.plt.subplots()
@@ -136,6 +137,7 @@ def test_both_style_and_color(self):
with pytest.raises(ValueError, match=msg):
s.plot(style="b-", color="#000099")
+ @pytest.mark.slow
def test_high_freq(self):
freaks = ["ms", "us"]
for freq in freaks:
@@ -152,6 +154,7 @@ def test_get_datevalue(self):
assert get_datevalue(Period(1987, "A"), "M") == Period("1987-12", "M").ordinal
assert get_datevalue("1/1/1987", "D") == Period("1987-1-1", "D").ordinal
+ @pytest.mark.slow
def test_ts_plot_format_coord(self):
def check_format_of_first_point(ax, expected_string):
first_line = ax.get_lines()[0]
@@ -176,10 +179,12 @@ def check_format_of_first_point(ax, expected_string):
check_format_of_first_point(ax, "t = 2014-01-01 y = 1.000000")
tm.close()
+ @pytest.mark.slow
def test_line_plot_period_series(self):
for s in self.period_ser:
_check_plot_works(s.plot, s.index.freq)
+ @pytest.mark.slow
@pytest.mark.parametrize(
"frqncy", ["1S", "3S", "5T", "7H", "4D", "8W", "11M", "3A"]
)
@@ -190,14 +195,17 @@ def test_line_plot_period_mlt_series(self, frqncy):
s = Series(np.random.randn(len(idx)), idx)
_check_plot_works(s.plot, s.index.freq.rule_code)
+ @pytest.mark.slow
def test_line_plot_datetime_series(self):
for s in self.datetime_ser:
_check_plot_works(s.plot, s.index.freq.rule_code)
+ @pytest.mark.slow
def test_line_plot_period_frame(self):
for df in self.period_df:
_check_plot_works(df.plot, df.index.freq)
+ @pytest.mark.slow
@pytest.mark.parametrize(
"frqncy", ["1S", "3S", "5T", "7H", "4D", "8W", "11M", "3A"]
)
@@ -210,11 +218,13 @@ def test_line_plot_period_mlt_frame(self, frqncy):
freq = df.index.asfreq(df.index.freq.rule_code).freq
_check_plot_works(df.plot, freq)
+ @pytest.mark.slow
def test_line_plot_datetime_frame(self):
for df in self.datetime_df:
freq = df.index.to_period(df.index.freq.rule_code).freq
_check_plot_works(df.plot, freq)
+ @pytest.mark.slow
def test_line_plot_inferred_freq(self):
for ser in self.datetime_ser:
ser = Series(ser.values, Index(np.asarray(ser.index)))
@@ -231,6 +241,7 @@ def test_fake_inferred_business(self):
ts.plot(ax=ax)
assert not hasattr(ax, "freq")
+ @pytest.mark.slow
def test_plot_offset_freq(self):
ser = tm.makeTimeSeries()
_check_plot_works(ser.plot)
@@ -239,11 +250,13 @@ def test_plot_offset_freq(self):
ser = Series(np.random.randn(len(dr)), index=dr)
_check_plot_works(ser.plot)
+ @pytest.mark.slow
def test_plot_multiple_inferred_freq(self):
dr = Index([datetime(2000, 1, 1), datetime(2000, 1, 6), datetime(2000, 1, 11)])
ser = Series(np.random.randn(len(dr)), index=dr)
_check_plot_works(ser.plot)
+ @pytest.mark.slow
def test_uhf(self):
import pandas.plotting._matplotlib.converter as conv
@@ -262,6 +275,7 @@ def test_uhf(self):
if len(rs):
assert xp == rs
+ @pytest.mark.slow
def test_irreg_hf(self):
idx = date_range("2012-6-22 21:59:51", freq="S", periods=100)
df = DataFrame(np.random.randn(len(idx), 2), index=idx)
@@ -308,6 +322,7 @@ def test_business_freq(self):
idx = ax.get_lines()[0].get_xdata()
assert PeriodIndex(data=idx).freqstr == "B"
+ @pytest.mark.slow
def test_business_freq_convert(self):
bts = tm.makeTimeSeries(300).asfreq("BM")
ts = bts.to_period("M")
@@ -345,6 +360,7 @@ def test_dataframe(self):
idx = ax.get_lines()[0].get_xdata()
tm.assert_index_equal(bts.index.to_period(), PeriodIndex(idx))
+ @pytest.mark.slow
def test_axis_limits(self):
def _test(ax):
xlim = ax.get_xlim()
@@ -395,6 +411,7 @@ def test_get_finder(self):
assert conv.get_finder(to_offset("A")) == conv._annual_finder
assert conv.get_finder(to_offset("W")) == conv._daily_finder
+ @pytest.mark.slow
def test_finder_daily(self):
day_lst = [10, 40, 252, 400, 950, 2750, 10000]
@@ -417,6 +434,7 @@ def test_finder_daily(self):
assert rs1 == xpl1
assert rs2 == xpl2
+ @pytest.mark.slow
def test_finder_quarterly(self):
yrs = [3.5, 11]
@@ -439,6 +457,7 @@ def test_finder_quarterly(self):
assert rs1 == xpl1
assert rs2 == xpl2
+ @pytest.mark.slow
def test_finder_monthly(self):
yrs = [1.15, 2.5, 4, 11]
@@ -471,6 +490,7 @@ def test_finder_monthly_long(self):
xp = Period("1989Q1", "M").ordinal
assert rs == xp
+ @pytest.mark.slow
def test_finder_annual(self):
xp = [1987, 1988, 1990, 1990, 1995, 2020, 2070, 2170]
xp = [Period(x, freq="A").ordinal for x in xp]
@@ -486,6 +506,7 @@ def test_finder_annual(self):
assert rs == xp
+ @pytest.mark.slow
def test_finder_minutely(self):
nminutes = 50 * 24 * 60
rng = date_range("1/1/1999", freq="Min", periods=nminutes)
@@ -510,6 +531,7 @@ def test_finder_hourly(self):
assert rs == xp
+ @pytest.mark.slow
def test_gaps(self):
ts = tm.makeTimeSeries()
ts[5:25] = np.nan
@@ -564,6 +586,7 @@ def test_gaps(self):
mask = data.mask
assert mask[2:5, 1].all()
+ @pytest.mark.slow
def test_gap_upsample(self):
low = tm.makeTimeSeries()
low[5:25] = np.nan
@@ -586,6 +609,7 @@ def test_gap_upsample(self):
mask = data.mask
assert mask[5:25, 1].all()
+ @pytest.mark.slow
def test_secondary_y(self):
ser = Series(np.random.randn(10))
ser2 = Series(np.random.randn(10))
@@ -614,6 +638,7 @@ def test_secondary_y(self):
assert hasattr(ax2, "left_ax")
assert not hasattr(ax2, "right_ax")
+ @pytest.mark.slow
def test_secondary_y_ts(self):
idx = date_range("1/1/2000", periods=10)
ser = Series(np.random.randn(10), idx)
@@ -639,6 +664,7 @@ def test_secondary_y_ts(self):
ax2 = ser.plot(secondary_y=True)
assert ax.get_yaxis().get_visible()
+ @pytest.mark.slow
@td.skip_if_no_scipy
def test_secondary_kde(self):
@@ -650,6 +676,7 @@ def test_secondary_kde(self):
axes = fig.get_axes()
assert axes[1].get_yaxis().get_ticks_position() == "right"
+ @pytest.mark.slow
def test_secondary_bar(self):
ser = Series(np.random.randn(10))
fig, ax = self.plt.subplots()
@@ -657,6 +684,7 @@ def test_secondary_bar(self):
axes = fig.get_axes()
assert axes[1].get_yaxis().get_ticks_position() == "right"
+ @pytest.mark.slow
def test_secondary_frame(self):
df = DataFrame(np.random.randn(5, 3), columns=["a", "b", "c"])
axes = df.plot(secondary_y=["a", "c"], subplots=True)
@@ -664,6 +692,7 @@ def test_secondary_frame(self):
assert axes[1].get_yaxis().get_ticks_position() == self.default_tick_position
assert axes[2].get_yaxis().get_ticks_position() == "right"
+ @pytest.mark.slow
def test_secondary_bar_frame(self):
df = DataFrame(np.random.randn(5, 3), columns=["a", "b", "c"])
axes = df.plot(kind="bar", secondary_y=["a", "c"], subplots=True)
@@ -693,6 +722,7 @@ def test_mixed_freq_regular_first(self):
assert left <= pidx[0].ordinal
assert right >= pidx[-1].ordinal
+ @pytest.mark.slow
def test_mixed_freq_irregular_first(self):
s1 = tm.makeTimeSeries()
s2 = s1[[0, 5, 10, 11, 12, 13, 14, 15]]
@@ -723,6 +753,7 @@ def test_mixed_freq_regular_first_df(self):
assert left <= pidx[0].ordinal
assert right >= pidx[-1].ordinal
+ @pytest.mark.slow
def test_mixed_freq_irregular_first_df(self):
# GH 9852
s1 = tm.makeTimeSeries().to_frame()
@@ -748,6 +779,7 @@ def test_mixed_freq_hf_first(self):
for line in ax.get_lines():
assert PeriodIndex(data=line.get_xdata()).freq == "D"
+ @pytest.mark.slow
def test_mixed_freq_alignment(self):
ts_ind = date_range("2012-01-01 13:00", "2012-01-02", freq="H")
ts_data = np.random.randn(12)
@@ -761,6 +793,7 @@ def test_mixed_freq_alignment(self):
assert ax.lines[0].get_xdata()[0] == ax.lines[1].get_xdata()[0]
+ @pytest.mark.slow
def test_mixed_freq_lf_first(self):
idxh = date_range("1/1/1999", periods=365, freq="D")
@@ -840,6 +873,7 @@ def test_nat_handling(self):
assert s.index.min() <= Series(xdata).min()
assert Series(xdata).max() <= s.index.max()
+ @pytest.mark.slow
def test_to_weekly_resampling(self):
idxh = date_range("1/1/1999", periods=52, freq="W")
idxl = date_range("1/1/1999", periods=12, freq="M")
@@ -851,6 +885,7 @@ def test_to_weekly_resampling(self):
for line in ax.get_lines():
assert PeriodIndex(data=line.get_xdata()).freq == idxh.freq
+ @pytest.mark.slow
def test_from_weekly_resampling(self):
idxh = date_range("1/1/1999", periods=52, freq="W")
idxl = date_range("1/1/1999", periods=12, freq="M")
@@ -874,6 +909,7 @@ def test_from_weekly_resampling(self):
tm.assert_numpy_array_equal(xdata, expected_h)
tm.close()
+ @pytest.mark.slow
def test_from_resampling_area_line_mixed(self):
idxh = date_range("1/1/1999", periods=52, freq="W")
idxl = date_range("1/1/1999", periods=12, freq="M")
@@ -965,6 +1001,7 @@ def test_from_resampling_area_line_mixed(self):
expected_y += low[i].values
tm.assert_numpy_array_equal(lines.get_ydata(orig=False), expected_y)
+ @pytest.mark.slow
def test_mixed_freq_second_millisecond(self):
# GH 7772, GH 7760
idxh = date_range("2014-07-01 09:00", freq="S", periods=50)
@@ -988,6 +1025,7 @@ def test_mixed_freq_second_millisecond(self):
for line in ax.get_lines():
assert PeriodIndex(data=line.get_xdata()).freq == "L"
+ @pytest.mark.slow
def test_irreg_dtypes(self):
# date
idx = [date(2000, 1, 1), date(2000, 1, 5), date(2000, 1, 20)]
@@ -1008,6 +1046,7 @@ def test_irreg_dtypes(self):
_, ax = self.plt.subplots()
_check_plot_works(df.plot, ax=ax)
+ @pytest.mark.slow
def test_time(self):
t = datetime(1, 1, 1, 3, 30, 0)
deltas = np.random.randint(1, 20, 3).cumsum()
@@ -1032,6 +1071,7 @@ def test_time(self):
xp = time(h, m, s).strftime("%H:%M")
assert xp == rs
+ @pytest.mark.slow
def test_time_change_xlim(self):
t = datetime(1, 1, 1, 3, 30, 0)
deltas = np.random.randint(1, 20, 3).cumsum()
@@ -1073,6 +1113,7 @@ def test_time_change_xlim(self):
xp = time(h, m, s).strftime("%H:%M")
assert xp == rs
+ @pytest.mark.slow
def test_time_musec(self):
t = datetime(1, 1, 1, 3, 30, 0)
deltas = np.random.randint(1, 20, 3).cumsum()
@@ -1104,6 +1145,7 @@ def test_time_musec(self):
xp = time(h, m, s, us).strftime("%H:%M")
assert xp == rs
+ @pytest.mark.slow
def test_secondary_upsample(self):
idxh = date_range("1/1/1999", periods=365, freq="D")
idxl = date_range("1/1/1999", periods=12, freq="M")
@@ -1119,6 +1161,7 @@ def test_secondary_upsample(self):
for line in ax.left_ax.get_lines():
assert PeriodIndex(line.get_xdata()).freq == "D"
+ @pytest.mark.slow
def test_secondary_legend(self):
fig = self.plt.figure()
ax = fig.add_subplot(211)
@@ -1220,6 +1263,7 @@ def test_format_date_axis(self):
if len(line.get_text()) > 0:
assert line.get_rotation() == 30
+ @pytest.mark.slow
def test_ax_plot(self):
x = date_range(start="2012-01-02", periods=10, freq="D")
y = list(range(len(x)))
@@ -1227,12 +1271,13 @@ def test_ax_plot(self):
lines = ax.plot(x, y, label="Y")
tm.assert_index_equal(DatetimeIndex(lines[0].get_xdata()), x)
+ @pytest.mark.slow
def test_mpl_nopandas(self):
dates = [date(2008, 12, 31), date(2009, 1, 31)]
values1 = np.arange(10.0, 11.0, 0.5)
values2 = np.arange(11.0, 12.0, 0.5)
- kw = {"fmt": "-", "lw": 4}
+ kw = dict(fmt="-", lw=4)
_, ax = self.plt.subplots()
ax.plot_date([x.toordinal() for x in dates], values1, **kw)
@@ -1245,6 +1290,7 @@ def test_mpl_nopandas(self):
exp = np.array([x.toordinal() for x in dates], dtype=np.float64)
tm.assert_numpy_array_equal(line2.get_xydata()[:, 0], exp)
+ @pytest.mark.slow
def test_irregular_ts_shared_ax_xlim(self):
# GH 2960
from pandas.plotting._matplotlib.converter import DatetimeConverter
@@ -1262,6 +1308,7 @@ def test_irregular_ts_shared_ax_xlim(self):
assert left <= DatetimeConverter.convert(ts_irregular.index.min(), "", ax)
assert right >= DatetimeConverter.convert(ts_irregular.index.max(), "", ax)
+ @pytest.mark.slow
def test_secondary_y_non_ts_xlim(self):
# GH 3490 - non-timeseries with secondary y
index_1 = [1, 2, 3, 4]
@@ -1278,6 +1325,7 @@ def test_secondary_y_non_ts_xlim(self):
assert left_before >= left_after
assert right_before < right_after
+ @pytest.mark.slow
def test_secondary_y_regular_ts_xlim(self):
# GH 3490 - regular-timeseries with secondary y
index_1 = date_range(start="2000-01-01", periods=4, freq="D")
@@ -1294,6 +1342,7 @@ def test_secondary_y_regular_ts_xlim(self):
assert left_before >= left_after
assert right_before < right_after
+ @pytest.mark.slow
def test_secondary_y_mixed_freq_ts_xlim(self):
# GH 3490 - mixed frequency timeseries with secondary y
rng = date_range("2000-01-01", periods=10000, freq="min")
@@ -1309,6 +1358,7 @@ def test_secondary_y_mixed_freq_ts_xlim(self):
assert left_before == left_after
assert right_before == right_after
+ @pytest.mark.slow
def test_secondary_y_irregular_ts_xlim(self):
# GH 3490 - irregular-timeseries with secondary y
from pandas.plotting._matplotlib.converter import DatetimeConverter
@@ -1402,6 +1452,7 @@ def test_hist(self):
_, ax = self.plt.subplots()
ax.hist([x, x], weights=[w1, w2])
+ @pytest.mark.slow
def test_overlapping_datetime(self):
        # GH 6608
s1 = Series(
diff --git a/pandas/tests/plotting/test_groupby.py b/pandas/tests/plotting/test_groupby.py
index f73ceee577a18..7ed29507fe0f4 100644
--- a/pandas/tests/plotting/test_groupby.py
+++ b/pandas/tests/plotting/test_groupby.py
@@ -11,8 +11,6 @@
import pandas._testing as tm
from pandas.tests.plotting.common import TestPlotBase
-pytestmark = pytest.mark.slow
-
@td.skip_if_no_mpl
class TestDataFrameGroupByPlots(TestPlotBase):
diff --git a/pandas/tests/plotting/test_hist_method.py b/pandas/tests/plotting/test_hist_method.py
index f700b2934cd8c..ab0024559333e 100644
--- a/pandas/tests/plotting/test_hist_method.py
+++ b/pandas/tests/plotting/test_hist_method.py
@@ -9,8 +9,6 @@
import pandas._testing as tm
from pandas.tests.plotting.common import TestPlotBase, _check_plot_works
-pytestmark = pytest.mark.slow
-
@td.skip_if_no_mpl
class TestSeriesPlots(TestPlotBase):
@@ -23,6 +21,7 @@ def setup_method(self, method):
self.ts = tm.makeTimeSeries()
self.ts.name = "ts"
+ @pytest.mark.slow
def test_hist_legacy(self):
_check_plot_works(self.ts.hist)
_check_plot_works(self.ts.hist, grid=False)
@@ -46,11 +45,13 @@ def test_hist_legacy(self):
with pytest.raises(ValueError):
self.ts.hist(by=self.ts.index, figure=fig)
+ @pytest.mark.slow
def test_hist_bins_legacy(self):
df = DataFrame(np.random.randn(10, 2))
ax = df.hist(bins=2)[0][0]
assert len(ax.patches) == 2
+ @pytest.mark.slow
def test_hist_layout(self):
df = self.hist_df
with pytest.raises(ValueError):
@@ -59,6 +60,7 @@ def test_hist_layout(self):
with pytest.raises(ValueError):
df.height.hist(layout=[1, 1])
+ @pytest.mark.slow
def test_hist_layout_with_by(self):
df = self.hist_df
@@ -96,6 +98,7 @@ def test_hist_layout_with_by(self):
axes = df.height.hist(by=df.category, layout=(4, 2), figsize=(12, 7))
self._check_axes_shape(axes, axes_num=4, layout=(4, 2), figsize=(12, 7))
+ @pytest.mark.slow
def test_hist_no_overlap(self):
from matplotlib.pyplot import gcf, subplot
@@ -109,11 +112,13 @@ def test_hist_no_overlap(self):
axes = fig.axes
assert len(axes) == 2
+ @pytest.mark.slow
def test_hist_by_no_extra_plots(self):
df = self.hist_df
axes = df.height.hist(by=df.gender) # noqa
assert len(self.plt.get_fignums()) == 1
+ @pytest.mark.slow
def test_plot_fails_when_ax_differs_from_figure(self):
from pylab import figure
@@ -165,6 +170,7 @@ def test_hist_with_legend_raises(self, by):
@td.skip_if_no_mpl
class TestDataFramePlots(TestPlotBase):
+ @pytest.mark.slow
def test_hist_df_legacy(self):
from matplotlib.patches import Rectangle
@@ -250,6 +256,7 @@ def test_hist_df_legacy(self):
with pytest.raises(AttributeError):
ser.hist(foo="bar")
+ @pytest.mark.slow
def test_hist_non_numerical_or_datetime_raises(self):
# gh-10444, GH32590
df = DataFrame(
@@ -275,6 +282,7 @@ def test_hist_non_numerical_or_datetime_raises(self):
with pytest.raises(ValueError, match=msg):
df_o.hist()
+ @pytest.mark.slow
def test_hist_layout(self):
df = DataFrame(np.random.randn(100, 2))
df[2] = to_datetime(
@@ -313,6 +321,7 @@ def test_hist_layout(self):
with pytest.raises(ValueError):
df.hist(layout=(-1, -1))
+ @pytest.mark.slow
# GH 9351
def test_tight_layout(self):
df = DataFrame(np.random.randn(100, 2))
@@ -435,6 +444,7 @@ def test_hist_with_legend_raises(self, by, column):
@td.skip_if_no_mpl
class TestDataFrameGroupByPlots(TestPlotBase):
+ @pytest.mark.slow
def test_grouped_hist_legacy(self):
from matplotlib.patches import Rectangle
@@ -504,6 +514,7 @@ def test_grouped_hist_legacy(self):
with pytest.raises(ValueError, match=msg):
df.hist(by="C", figsize="default")
+ @pytest.mark.slow
def test_grouped_hist_legacy2(self):
n = 10
weight = Series(np.random.normal(166, 20, size=n))
@@ -517,6 +528,7 @@ def test_grouped_hist_legacy2(self):
assert len(self.plt.get_fignums()) == 2
tm.close()
+ @pytest.mark.slow
def test_grouped_hist_layout(self):
df = self.hist_df
msg = "Layout of 1x1 must be larger than required size 2"
@@ -571,6 +583,7 @@ def test_grouped_hist_layout(self):
axes = df.hist(column=["height", "weight", "category"])
self._check_axes_shape(axes, axes_num=3, layout=(2, 2))
+ @pytest.mark.slow
def test_grouped_hist_multiple_axes(self):
# GH 6970, GH 7069
df = self.hist_df
@@ -590,6 +603,7 @@ def test_grouped_hist_multiple_axes(self):
# pass different number of axes from required
axes = df.hist(column="height", ax=axes)
+ @pytest.mark.slow
def test_axis_share_x(self):
df = self.hist_df
# GH4089
@@ -603,6 +617,7 @@ def test_axis_share_x(self):
assert not ax1._shared_y_axes.joined(ax1, ax2)
assert not ax2._shared_y_axes.joined(ax1, ax2)
+ @pytest.mark.slow
def test_axis_share_y(self):
df = self.hist_df
ax1, ax2 = df.hist(column="height", by=df.gender, sharey=True)
@@ -615,6 +630,7 @@ def test_axis_share_y(self):
assert not ax1._shared_x_axes.joined(ax1, ax2)
assert not ax2._shared_x_axes.joined(ax1, ax2)
+ @pytest.mark.slow
def test_axis_share_xy(self):
df = self.hist_df
ax1, ax2 = df.hist(column="height", by=df.gender, sharex=True, sharey=True)
diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py
index 1208100ed2dce..f37d83cd0783e 100644
--- a/pandas/tests/plotting/test_misc.py
+++ b/pandas/tests/plotting/test_misc.py
@@ -11,8 +11,6 @@
import pandas.plotting as plotting
-pytestmark = pytest.mark.slow
-
@td.skip_if_mpl
def test_import_error_message():
@@ -68,6 +66,7 @@ def setup_method(self, method):
self.ts = tm.makeTimeSeries()
self.ts.name = "ts"
+ @pytest.mark.slow
def test_autocorrelation_plot(self):
from pandas.plotting import autocorrelation_plot
@@ -77,12 +76,14 @@ def test_autocorrelation_plot(self):
ax = autocorrelation_plot(self.ts, label="Test")
self._check_legend_labels(ax, labels=["Test"])
+ @pytest.mark.slow
def test_lag_plot(self):
from pandas.plotting import lag_plot
_check_plot_works(lag_plot, series=self.ts)
_check_plot_works(lag_plot, series=self.ts, lag=5)
+ @pytest.mark.slow
def test_bootstrap_plot(self):
from pandas.plotting import bootstrap_plot
@@ -126,6 +127,7 @@ def test_scatter_matrix_axis(self):
self._check_text_labels(axes0_labels, expected)
self._check_ticks_props(axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0)
+ @pytest.mark.slow
def test_andrews_curves(self, iris):
from matplotlib import cm
@@ -201,6 +203,7 @@ def test_andrews_curves(self, iris):
handles, labels = ax.get_legend_handles_labels()
self._check_colors(handles, linecolors=colors)
+ @pytest.mark.slow
def test_parallel_coordinates(self, iris):
from matplotlib import cm
@@ -274,6 +277,7 @@ def test_parallel_coordinates_with_sorted_labels(self):
# labels and colors are ordered strictly increasing
assert prev[1] < nxt[1] and prev[0] < nxt[0]
+ @pytest.mark.slow
def test_radviz(self, iris):
from matplotlib import cm
@@ -306,6 +310,7 @@ def test_radviz(self, iris):
handles, labels = ax.get_legend_handles_labels()
self._check_colors(handles, facecolors=colors)
+ @pytest.mark.slow
def test_subplot_titles(self, iris):
df = iris.drop("Name", axis=1).head()
# Use the column names as the subplot titles
@@ -406,6 +411,7 @@ def test_get_standard_colors_no_appending(self):
p = df.A.plot.bar(figsize=(16, 7), color=color_list)
assert p.patches[1].get_facecolor() == p.patches[17].get_facecolor()
+ @pytest.mark.slow
def test_dictionary_color(self):
# issue-8193
# Test plot color dictionary format
@@ -426,6 +432,7 @@ def test_dictionary_color(self):
colors = [rect.get_color() for rect in ax.get_lines()[0:2]]
assert all(color == expected[index] for index, color in enumerate(colors))
+ @pytest.mark.slow
def test_has_externally_shared_axis_x_axis(self):
# GH33819
# Test _has_externally_shared_axis() works for x-axis
@@ -451,6 +458,7 @@ def test_has_externally_shared_axis_x_axis(self):
assert func(plots[0][2], "x")
assert not func(plots[0][3], "x")
+ @pytest.mark.slow
def test_has_externally_shared_axis_y_axis(self):
# GH33819
# Test _has_externally_shared_axis() works for y-axis
@@ -476,6 +484,7 @@ def test_has_externally_shared_axis_y_axis(self):
assert func(plots[2][0], "y")
assert not func(plots[3][0], "y")
+ @pytest.mark.slow
def test_has_externally_shared_axis_invalid_compare_axis(self):
# GH33819
# Test _has_externally_shared_axis() raises an exception when
@@ -493,6 +502,7 @@ def test_has_externally_shared_axis_invalid_compare_axis(self):
with pytest.raises(ValueError, match=msg):
func(plots[0][0], "z")
+ @pytest.mark.slow
def test_externally_shared_axes(self):
# Example from GH33819
# Create data
diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py
index 9da2336fb9342..b8dd2ada87506 100644
--- a/pandas/tests/plotting/test_series.py
+++ b/pandas/tests/plotting/test_series.py
@@ -16,8 +16,6 @@
import pandas.plotting as plotting
-pytestmark = pytest.mark.slow
-
@td.skip_if_no_mpl
class TestSeriesPlots(TestPlotBase):
@@ -36,6 +34,7 @@ def setup_method(self, method):
self.iseries = tm.makePeriodSeries()
self.iseries.name = "iseries"
+ @pytest.mark.slow
def test_plot(self):
_check_plot_works(self.ts.plot, label="foo")
_check_plot_works(self.ts.plot, use_index=False)
@@ -71,6 +70,7 @@ def test_plot(self):
ax = _check_plot_works(self.ts.plot, subplots=True, layout=(1, -1))
self._check_axes_shape(ax, axes_num=1, layout=(1, 1))
+ @pytest.mark.slow
def test_plot_figsize_and_title(self):
# figsize and title
_, ax = self.plt.subplots()
@@ -222,6 +222,7 @@ def test_line_use_index_false(self):
label2 = ax2.get_xlabel()
assert label2 == ""
+ @pytest.mark.slow
def test_bar_log(self):
expected = np.array([1e-1, 1e0, 1e1, 1e2, 1e3, 1e4])
@@ -255,6 +256,7 @@ def test_bar_log(self):
tm.assert_almost_equal(res[1], ymax)
tm.assert_numpy_array_equal(ax.xaxis.get_ticklocs(), expected)
+ @pytest.mark.slow
def test_bar_ignore_index(self):
df = Series([1, 2, 3, 4], index=["a", "b", "c", "d"])
_, ax = self.plt.subplots()
@@ -309,6 +311,7 @@ def test_unsorted_index_xlim(self):
assert xmin <= np.nanmin(lines[0].get_data(orig=False)[0])
assert xmax >= np.nanmax(lines[0].get_data(orig=False)[0])
+ @pytest.mark.slow
def test_pie_series(self):
        # if the sum of values is less than 1.0, pie handles them as rates and draws a
        # semicircle.
@@ -365,12 +368,14 @@ def test_pie_nan(self):
result = [x.get_text() for x in ax.texts]
assert result == expected
+ @pytest.mark.slow
def test_hist_df_kwargs(self):
df = DataFrame(np.random.randn(10, 2))
_, ax = self.plt.subplots()
ax = df.plot.hist(bins=5, ax=ax)
assert len(ax.patches) == 10
+ @pytest.mark.slow
def test_hist_df_with_nonnumerics(self):
# GH 9853
with tm.RNGContext(1):
@@ -384,6 +389,7 @@ def test_hist_df_with_nonnumerics(self):
ax = df.plot.hist(ax=ax) # bins=10
assert len(ax.patches) == 40
+ @pytest.mark.slow
def test_hist_legacy(self):
_check_plot_works(self.ts.hist)
_check_plot_works(self.ts.hist, grid=False)
@@ -407,11 +413,13 @@ def test_hist_legacy(self):
with pytest.raises(ValueError):
self.ts.hist(by=self.ts.index, figure=fig)
+ @pytest.mark.slow
def test_hist_bins_legacy(self):
df = DataFrame(np.random.randn(10, 2))
ax = df.hist(bins=2)[0][0]
assert len(ax.patches) == 2
+ @pytest.mark.slow
def test_hist_layout(self):
df = self.hist_df
with pytest.raises(ValueError):
@@ -420,6 +428,7 @@ def test_hist_layout(self):
with pytest.raises(ValueError):
df.height.hist(layout=[1, 1])
+ @pytest.mark.slow
def test_hist_layout_with_by(self):
df = self.hist_df
@@ -455,6 +464,7 @@ def test_hist_layout_with_by(self):
axes = df.height.hist(by=df.category, layout=(4, 2), figsize=(12, 7))
self._check_axes_shape(axes, axes_num=4, layout=(4, 2), figsize=(12, 7))
+ @pytest.mark.slow
def test_hist_no_overlap(self):
from matplotlib.pyplot import gcf, subplot
@@ -468,6 +478,7 @@ def test_hist_no_overlap(self):
axes = fig.axes
assert len(axes) == 2
+ @pytest.mark.slow
def test_hist_secondary_legend(self):
# GH 9610
df = DataFrame(np.random.randn(30, 4), columns=list("abcd"))
@@ -506,6 +517,7 @@ def test_hist_secondary_legend(self):
assert ax.get_yaxis().get_visible()
tm.close()
+ @pytest.mark.slow
def test_df_series_secondary_legend(self):
# GH 9779
df = DataFrame(np.random.randn(30, 3), columns=list("abc"))
@@ -569,6 +581,7 @@ def test_df_series_secondary_legend(self):
assert ax.get_yaxis().get_visible()
tm.close()
+ @pytest.mark.slow
@pytest.mark.parametrize(
"input_logy, expected_scale", [(True, "log"), ("sym", "symlog")]
)
@@ -584,12 +597,14 @@ def test_secondary_logy(self, input_logy, expected_scale):
assert ax1.get_yscale() == expected_scale
assert ax2.get_yscale() == expected_scale
+ @pytest.mark.slow
def test_plot_fails_with_dupe_color_and_style(self):
x = Series(np.random.randn(2))
with pytest.raises(ValueError):
_, ax = self.plt.subplots()
x.plot(style="k--", color="k", ax=ax)
+ @pytest.mark.slow
@td.skip_if_no_scipy
def test_hist_kde(self):
@@ -612,6 +627,7 @@ def test_hist_kde(self):
ylabels = ax.get_yticklabels()
self._check_text_labels(ylabels, [""] * len(ylabels))
+ @pytest.mark.slow
@td.skip_if_no_scipy
def test_kde_kwargs(self):
sample_points = np.linspace(-100, 100, 20)
@@ -625,6 +641,7 @@ def test_kde_kwargs(self):
self._check_ax_scales(ax, yaxis="log")
self._check_text_labels(ax.yaxis.get_label(), "Density")
+ @pytest.mark.slow
@td.skip_if_no_scipy
def test_kde_missing_vals(self):
s = Series(np.random.uniform(size=50))
@@ -634,6 +651,7 @@ def test_kde_missing_vals(self):
# gh-14821: check if the values have any missing values
assert any(~np.isnan(axes.lines[0].get_xdata()))
+ @pytest.mark.slow
def test_hist_kwargs(self):
_, ax = self.plt.subplots()
ax = self.ts.plot.hist(bins=5, ax=ax)
@@ -650,6 +668,7 @@ def test_hist_kwargs(self):
ax = self.ts.plot.hist(align="left", stacked=True, ax=ax)
tm.close()
+ @pytest.mark.slow
@td.skip_if_no_scipy
def test_hist_kde_color(self):
_, ax = self.plt.subplots()
@@ -665,6 +684,7 @@ def test_hist_kde_color(self):
assert len(lines) == 1
self._check_colors(lines, ["r"])
+ @pytest.mark.slow
def test_boxplot_series(self):
_, ax = self.plt.subplots()
ax = self.ts.plot.box(logy=True, ax=ax)
@@ -674,6 +694,7 @@ def test_boxplot_series(self):
ylabels = ax.get_yticklabels()
self._check_text_labels(ylabels, [""] * len(ylabels))
+ @pytest.mark.slow
def test_kind_both_ways(self):
s = Series(range(3))
kinds = (
@@ -687,6 +708,7 @@ def test_kind_both_ways(self):
getattr(s.plot, kind)()
self.plt.close()
+ @pytest.mark.slow
def test_invalid_plot_data(self):
s = Series(list("abcd"))
_, ax = self.plt.subplots()
@@ -696,6 +718,7 @@ def test_invalid_plot_data(self):
with pytest.raises(TypeError, match=msg):
s.plot(kind=kind, ax=ax)
+ @pytest.mark.slow
def test_valid_object_plot(self):
s = Series(range(10), dtype=object)
for kind in plotting.PlotAccessor._common_kinds:
@@ -715,6 +738,7 @@ def test_invalid_kind(self):
with pytest.raises(ValueError):
s.plot(kind="aasdf")
+ @pytest.mark.slow
def test_dup_datetime_index_plot(self):
dr1 = date_range("1/1/2009", periods=4)
dr2 = date_range("1/2/2009", periods=4)
@@ -743,6 +767,7 @@ def test_errorbar_asymmetrical(self):
tm.close()
+ @pytest.mark.slow
def test_errorbar_plot(self):
s = Series(np.arange(10), name="x")
@@ -788,6 +813,7 @@ def test_table(self):
_check_plot_works(self.series.plot, table=True)
_check_plot_works(self.series.plot, table=self.series)
+ @pytest.mark.slow
def test_series_grid_settings(self):
# Make sure plot defaults to rcParams['axes.grid'] setting, GH 9792
self._check_grid_settings(
@@ -795,6 +821,7 @@ def test_series_grid_settings(self):
plotting.PlotAccessor._series_kinds + plotting.PlotAccessor._common_kinds,
)
+ @pytest.mark.slow
def test_standard_colors(self):
from pandas.plotting._matplotlib.style import get_standard_colors
@@ -811,6 +838,7 @@ def test_standard_colors(self):
result = get_standard_colors(3, color=[c])
assert result == [c] * 3
+ @pytest.mark.slow
def test_standard_colors_all(self):
import matplotlib.colors as colors
diff --git a/pandas/tests/plotting/test_style.py b/pandas/tests/plotting/test_style.py
index 3c48eeaccbf34..665bda15724fd 100644
--- a/pandas/tests/plotting/test_style.py
+++ b/pandas/tests/plotting/test_style.py
@@ -5,8 +5,6 @@
pytest.importorskip("matplotlib")
from pandas.plotting._matplotlib.style import get_standard_colors
-pytestmark = pytest.mark.slow
-
class TestGetStandardColors:
@pytest.mark.parametrize(
diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py
index daa9ac531d556..3c9fab2d4090c 100644
--- a/pandas/tests/resample/test_datetime_index.py
+++ b/pandas/tests/resample/test_datetime_index.py
@@ -1273,7 +1273,7 @@ def test_resample_timegrouper():
dates3 = [pd.NaT] + dates1 + [pd.NaT]
for dates in [dates1, dates2, dates3]:
- df = DataFrame({"A": dates, "B": np.arange(len(dates))})
+ df = DataFrame(dict(A=dates, B=np.arange(len(dates))))
result = df.set_index("A").resample("M").count()
exp_idx = DatetimeIndex(
["2014-07-31", "2014-08-31", "2014-09-30", "2014-10-31", "2014-11-30"],
@@ -1288,9 +1288,7 @@ def test_resample_timegrouper():
result = df.groupby(Grouper(freq="M", key="A")).count()
tm.assert_frame_equal(result, expected)
- df = DataFrame(
- {"A": dates, "B": np.arange(len(dates)), "C": np.arange(len(dates))}
- )
+ df = DataFrame(dict(A=dates, B=np.arange(len(dates)), C=np.arange(len(dates))))
result = df.set_index("A").resample("M").count()
expected = DataFrame(
{"B": [1, 0, 2, 2, 1], "C": [1, 0, 2, 2, 1]},
@@ -1730,7 +1728,7 @@ def test_resample_apply_product():
index = date_range(start="2012-01-31", freq="M", periods=12)
ts = Series(range(12), index=index)
- df = DataFrame({"A": ts, "B": ts + 2})
+ df = DataFrame(dict(A=ts, B=ts + 2))
result = df.resample("Q").apply(np.product)
expected = DataFrame(
np.array([[0, 24], [60, 210], [336, 720], [990, 1716]], dtype=np.int64),
diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py
index c12111e20a4b1..50e7cf9bd8eda 100644
--- a/pandas/tests/resample/test_time_grouper.py
+++ b/pandas/tests/resample/test_time_grouper.py
@@ -158,12 +158,12 @@ def test_aggregate_normal(resample_method):
@pytest.mark.parametrize(
"method, method_args, unit",
[
- ("sum", {}, 0),
- ("sum", {"min_count": 0}, 0),
- ("sum", {"min_count": 1}, np.nan),
- ("prod", {}, 1),
- ("prod", {"min_count": 0}, 1),
- ("prod", {"min_count": 1}, np.nan),
+ ("sum", dict(), 0),
+ ("sum", dict(min_count=0), 0),
+ ("sum", dict(min_count=1), np.nan),
+ ("prod", dict(), 1),
+ ("prod", dict(min_count=0), 1),
+ ("prod", dict(min_count=1), np.nan),
],
)
def test_resample_entirely_nat_window(method, method_args, unit):
@@ -267,14 +267,14 @@ def test_repr():
@pytest.mark.parametrize(
"method, method_args, expected_values",
[
- ("sum", {}, [1, 0, 1]),
- ("sum", {"min_count": 0}, [1, 0, 1]),
- ("sum", {"min_count": 1}, [1, np.nan, 1]),
- ("sum", {"min_count": 2}, [np.nan, np.nan, np.nan]),
- ("prod", {}, [1, 1, 1]),
- ("prod", {"min_count": 0}, [1, 1, 1]),
- ("prod", {"min_count": 1}, [1, np.nan, 1]),
- ("prod", {"min_count": 2}, [np.nan, np.nan, np.nan]),
+ ("sum", dict(), [1, 0, 1]),
+ ("sum", dict(min_count=0), [1, 0, 1]),
+ ("sum", dict(min_count=1), [1, np.nan, 1]),
+ ("sum", dict(min_count=2), [np.nan, np.nan, np.nan]),
+ ("prod", dict(), [1, 1, 1]),
+ ("prod", dict(min_count=0), [1, 1, 1]),
+ ("prod", dict(min_count=1), [1, np.nan, 1]),
+ ("prod", dict(min_count=2), [np.nan, np.nan, np.nan]),
],
)
def test_upsample_sum(method, method_args, expected_values):
diff --git a/pandas/tests/reshape/concat/test_categorical.py b/pandas/tests/reshape/concat/test_categorical.py
index 6dae28003d3b6..388575c5a3b86 100644
--- a/pandas/tests/reshape/concat/test_categorical.py
+++ b/pandas/tests/reshape/concat/test_categorical.py
@@ -1,4 +1,5 @@
import numpy as np
+import pytest
from pandas.core.dtypes.dtypes import CategoricalDtype
@@ -136,18 +137,13 @@ def test_categorical_index_preserver(self):
).set_index("B")
tm.assert_frame_equal(result, expected)
- # wrong categories -> uses concat_compat, which casts to object
+ # wrong categories
df3 = DataFrame(
{"A": a, "B": Categorical(b, categories=list("abe"))}
).set_index("B")
- result = pd.concat([df2, df3])
- expected = pd.concat(
- [
- df2.set_axis(df2.index.astype(object), 0),
- df3.set_axis(df3.index.astype(object), 0),
- ]
- )
- tm.assert_frame_equal(result, expected)
+ msg = "categories must match existing categories when appending"
+ with pytest.raises(TypeError, match=msg):
+ pd.concat([df2, df3])
def test_concat_categorical_tz(self):
# GH-23816
diff --git a/pandas/tests/reshape/concat/test_datetimes.py b/pandas/tests/reshape/concat/test_datetimes.py
index 44a5e7f806309..a4d6b58307523 100644
--- a/pandas/tests/reshape/concat/test_datetimes.py
+++ b/pandas/tests/reshape/concat/test_datetimes.py
@@ -373,10 +373,10 @@ def test_concat_tz_series_with_datetimelike(self):
def test_concat_tz_frame(self):
df2 = DataFrame(
- {
- "A": Timestamp("20130102", tz="US/Eastern"),
- "B": Timestamp("20130603", tz="CET"),
- },
+ dict(
+ A=Timestamp("20130102", tz="US/Eastern"),
+ B=Timestamp("20130603", tz="CET"),
+ ),
index=range(5),
)
@@ -391,20 +391,20 @@ def test_concat_multiple_tzs(self):
ts2 = Timestamp("2015-01-01", tz="UTC")
ts3 = Timestamp("2015-01-01", tz="EST")
- df1 = DataFrame({"time": [ts1]})
- df2 = DataFrame({"time": [ts2]})
- df3 = DataFrame({"time": [ts3]})
+ df1 = DataFrame(dict(time=[ts1]))
+ df2 = DataFrame(dict(time=[ts2]))
+ df3 = DataFrame(dict(time=[ts3]))
results = pd.concat([df1, df2]).reset_index(drop=True)
- expected = DataFrame({"time": [ts1, ts2]}, dtype=object)
+ expected = DataFrame(dict(time=[ts1, ts2]), dtype=object)
tm.assert_frame_equal(results, expected)
results = pd.concat([df1, df3]).reset_index(drop=True)
- expected = DataFrame({"time": [ts1, ts3]}, dtype=object)
+ expected = DataFrame(dict(time=[ts1, ts3]), dtype=object)
tm.assert_frame_equal(results, expected)
results = pd.concat([df2, df3]).reset_index(drop=True)
- expected = DataFrame({"time": [ts2, ts3]})
+ expected = DataFrame(dict(time=[ts2, ts3]))
tm.assert_frame_equal(results, expected)
def test_concat_multiindex_with_tz(self):
diff --git a/pandas/tests/reshape/concat/test_empty.py b/pandas/tests/reshape/concat/test_empty.py
index a97e9265b4f99..5c540124de8e6 100644
--- a/pandas/tests/reshape/concat/test_empty.py
+++ b/pandas/tests/reshape/concat/test_empty.py
@@ -26,7 +26,7 @@ def test_handle_empty_objects(self, sort):
# empty as first element with time series
# GH3259
df = DataFrame(
- {"A": range(10000)}, index=date_range("20130101", periods=10000, freq="s")
+ dict(A=range(10000)), index=date_range("20130101", periods=10000, freq="s")
)
empty = DataFrame()
result = concat([df, empty], axis=1)
diff --git a/pandas/tests/reshape/concat/test_invalid.py b/pandas/tests/reshape/concat/test_invalid.py
index cc9f09c16fb43..3a886e0d612c6 100644
--- a/pandas/tests/reshape/concat/test_invalid.py
+++ b/pandas/tests/reshape/concat/test_invalid.py
@@ -12,7 +12,7 @@ def test_concat_invalid(self):
        # trying to concat an ndframe with a non-ndframe
df1 = tm.makeCustomDataframe(10, 2)
- for obj in [1, {}, [1, 2], (1, 2)]:
+ for obj in [1, dict(), [1, 2], (1, 2)]:
msg = (
f"cannot concatenate object of type '{type(obj)}'; "
@@ -45,7 +45,7 @@ def test_concat_invalid_first_argument(self):
bar2,12,13,14,15
"""
- with read_csv(StringIO(data), chunksize=1) as reader:
- result = concat(reader, ignore_index=True)
+ reader = read_csv(StringIO(data), chunksize=1)
+ result = concat(reader, ignore_index=True)
expected = read_csv(StringIO(data))
tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
index f43ae58fbcc2f..f44909b61ff7a 100644
--- a/pandas/tests/reshape/merge/test_merge.py
+++ b/pandas/tests/reshape/merge/test_merge.py
@@ -422,10 +422,10 @@ def test_left_merge_empty_dataframe(self):
@pytest.mark.parametrize(
"kwarg",
[
- {"left_index": True, "right_index": True},
- {"left_index": True, "right_on": "x"},
- {"left_on": "a", "right_index": True},
- {"left_on": "a", "right_on": "x"},
+ dict(left_index=True, right_index=True),
+ dict(left_index=True, right_on="x"),
+ dict(left_on="a", right_index=True),
+ dict(left_on="a", right_on="x"),
],
)
def test_merge_left_empty_right_empty(self, join_type, kwarg):
@@ -475,18 +475,18 @@ def check2(exp, kwarg):
tm.assert_frame_equal(result, exp)
for kwarg in [
- {"left_index": True, "right_index": True},
- {"left_index": True, "right_on": "x"},
+ dict(left_index=True, right_index=True),
+ dict(left_index=True, right_on="x"),
]:
check1(exp_in, kwarg)
check2(exp_out, kwarg)
- kwarg = {"left_on": "a", "right_index": True}
+ kwarg = dict(left_on="a", right_index=True)
check1(exp_in, kwarg)
exp_out["a"] = [0, 1, 2]
check2(exp_out, kwarg)
- kwarg = {"left_on": "a", "right_on": "x"}
+ kwarg = dict(left_on="a", right_on="x")
check1(exp_in, kwarg)
exp_out["a"] = np.array([np.nan] * 3, dtype=object)
check2(exp_out, kwarg)
@@ -524,10 +524,10 @@ def check2(exp, kwarg):
tm.assert_frame_equal(result, exp)
for kwarg in [
- {"left_index": True, "right_index": True},
- {"left_index": True, "right_on": "x"},
- {"left_on": "a", "right_index": True},
- {"left_on": "a", "right_on": "x"},
+ dict(left_index=True, right_index=True),
+ dict(left_index=True, right_on="x"),
+ dict(left_on="a", right_index=True),
+ dict(left_on="a", right_on="x"),
]:
check1(exp_in, kwarg)
check2(exp_out, kwarg)
@@ -753,7 +753,7 @@ def test_overlapping_columns_error_message(self):
# #2649, #10639
df2.columns = ["key1", "foo", "foo"]
- msg = r"Data columns not unique: Index\(\['foo'\], dtype='object'\)"
+ msg = r"Data columns not unique: Index\(\['foo', 'foo'\], dtype='object'\)"
with pytest.raises(MergeError, match=msg):
merge(df, df2)
@@ -1999,19 +1999,19 @@ def test_merge_series(on, left_on, right_on, left_index, right_index, nm):
@pytest.mark.parametrize(
"col1, col2, kwargs, expected_cols",
[
- (0, 0, {"suffixes": ("", "_dup")}, ["0", "0_dup"]),
- (0, 0, {"suffixes": (None, "_dup")}, [0, "0_dup"]),
- (0, 0, {"suffixes": ("_x", "_y")}, ["0_x", "0_y"]),
- (0, 0, {"suffixes": ["_x", "_y"]}, ["0_x", "0_y"]),
- ("a", 0, {"suffixes": (None, "_y")}, ["a", 0]),
- (0.0, 0.0, {"suffixes": ("_x", None)}, ["0.0_x", 0.0]),
- ("b", "b", {"suffixes": (None, "_y")}, ["b", "b_y"]),
- ("a", "a", {"suffixes": ("_x", None)}, ["a_x", "a"]),
- ("a", "b", {"suffixes": ("_x", None)}, ["a", "b"]),
- ("a", "a", {"suffixes": (None, "_x")}, ["a", "a_x"]),
- (0, 0, {"suffixes": ("_a", None)}, ["0_a", 0]),
- ("a", "a", {}, ["a_x", "a_y"]),
- (0, 0, {}, ["0_x", "0_y"]),
+ (0, 0, dict(suffixes=("", "_dup")), ["0", "0_dup"]),
+ (0, 0, dict(suffixes=(None, "_dup")), [0, "0_dup"]),
+ (0, 0, dict(suffixes=("_x", "_y")), ["0_x", "0_y"]),
+ (0, 0, dict(suffixes=["_x", "_y"]), ["0_x", "0_y"]),
+ ("a", 0, dict(suffixes=(None, "_y")), ["a", 0]),
+ (0.0, 0.0, dict(suffixes=("_x", None)), ["0.0_x", 0.0]),
+ ("b", "b", dict(suffixes=(None, "_y")), ["b", "b_y"]),
+ ("a", "a", dict(suffixes=("_x", None)), ["a_x", "a"]),
+ ("a", "b", dict(suffixes=("_x", None)), ["a", "b"]),
+ ("a", "a", dict(suffixes=(None, "_x")), ["a", "a_x"]),
+ (0, 0, dict(suffixes=("_a", None)), ["0_a", 0]),
+ ("a", "a", dict(), ["a_x", "a_y"]),
+ (0, 0, dict(), ["0_x", "0_y"]),
],
)
def test_merge_suffix(col1, col2, kwargs, expected_cols):
diff --git a/pandas/tests/reshape/merge/test_merge_index_as_string.py b/pandas/tests/reshape/merge/test_merge_index_as_string.py
index c3e0a92850c07..d20d93370ec7e 100644
--- a/pandas/tests/reshape/merge/test_merge_index_as_string.py
+++ b/pandas/tests/reshape/merge/test_merge_index_as_string.py
@@ -8,22 +8,22 @@
@pytest.fixture
def df1():
return DataFrame(
- {
- "outer": [1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4],
- "inner": [1, 2, 3, 1, 2, 3, 4, 1, 2, 1, 2],
- "v1": np.linspace(0, 1, 11),
- }
+ dict(
+ outer=[1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4],
+ inner=[1, 2, 3, 1, 2, 3, 4, 1, 2, 1, 2],
+ v1=np.linspace(0, 1, 11),
+ )
)
@pytest.fixture
def df2():
return DataFrame(
- {
- "outer": [1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3],
- "inner": [1, 2, 2, 3, 3, 4, 2, 3, 1, 1, 2, 3],
- "v2": np.linspace(10, 11, 12),
- }
+ dict(
+ outer=[1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3],
+ inner=[1, 2, 2, 3, 3, 4, 2, 3, 1, 1, 2, 3],
+ v2=np.linspace(10, 11, 12),
+ )
)
diff --git a/pandas/tests/reshape/merge/test_merge_ordered.py b/pandas/tests/reshape/merge/test_merge_ordered.py
index 4a70719df5c57..17f2f44f45fce 100644
--- a/pandas/tests/reshape/merge/test_merge_ordered.py
+++ b/pandas/tests/reshape/merge/test_merge_ordered.py
@@ -115,84 +115,3 @@ def test_doc_example(self):
)
tm.assert_frame_equal(result, expected)
-
- @pytest.mark.parametrize(
- "left, right, on, left_by, right_by, expected",
- [
- (
- DataFrame({"G": ["g", "g"], "H": ["h", "h"], "T": [1, 3]}),
- DataFrame({"T": [2], "E": [1]}),
- ["T"],
- ["G", "H"],
- None,
- DataFrame(
- {
- "G": ["g"] * 3,
- "H": ["h"] * 3,
- "T": [1, 2, 3],
- "E": [np.nan, 1.0, np.nan],
- }
- ),
- ),
- (
- DataFrame({"G": ["g", "g"], "H": ["h", "h"], "T": [1, 3]}),
- DataFrame({"T": [2], "E": [1]}),
- "T",
- ["G", "H"],
- None,
- DataFrame(
- {
- "G": ["g"] * 3,
- "H": ["h"] * 3,
- "T": [1, 2, 3],
- "E": [np.nan, 1.0, np.nan],
- }
- ),
- ),
- (
- DataFrame({"T": [2], "E": [1]}),
- DataFrame({"G": ["g", "g"], "H": ["h", "h"], "T": [1, 3]}),
- ["T"],
- None,
- ["G", "H"],
- DataFrame(
- {
- "T": [1, 2, 3],
- "E": [np.nan, 1.0, np.nan],
- "G": ["g"] * 3,
- "H": ["h"] * 3,
- }
- ),
- ),
- ],
- )
- def test_list_type_by(self, left, right, on, left_by, right_by, expected):
- # GH 35269
- result = merge_ordered(
- left=left,
- right=right,
- on=on,
- left_by=left_by,
- right_by=right_by,
- )
-
- tm.assert_frame_equal(result, expected)
-
- def test_left_by_length_equals_to_right_shape0(self):
- # GH 38166
- left = DataFrame([["g", "h", 1], ["g", "h", 3]], columns=list("GHT"))
- right = DataFrame([[2, 1]], columns=list("TE"))
- result = merge_ordered(left, right, on="T", left_by=["G", "H"])
- expected = DataFrame(
- {"G": ["g"] * 3, "H": ["h"] * 3, "T": [1, 2, 3], "E": [np.nan, 1.0, np.nan]}
- )
-
- tm.assert_frame_equal(result, expected)
-
- def test_elements_not_in_by_but_in_df(self):
- # GH 38167
- left = DataFrame([["g", "h", 1], ["g", "h", 3]], columns=list("GHT"))
- right = DataFrame([[2, 1]], columns=list("TE"))
- msg = r"\{'h'\} not found in left columns"
- with pytest.raises(KeyError, match=msg):
- merge_ordered(left, right, on="T", left_by=["G", "h"])
diff --git a/pandas/tests/reshape/merge/test_multi.py b/pandas/tests/reshape/merge/test_multi.py
index 673c97740594f..260a0e9d486b2 100644
--- a/pandas/tests/reshape/merge/test_multi.py
+++ b/pandas/tests/reshape/merge/test_multi.py
@@ -36,13 +36,13 @@ def right():
@pytest.fixture
def left_multi():
return DataFrame(
- {
- "Origin": ["A", "A", "B", "B", "C"],
- "Destination": ["A", "B", "A", "C", "A"],
- "Period": ["AM", "AM", "IP", "AM", "OP"],
- "TripPurp": ["hbw", "nhb", "hbo", "nhb", "hbw"],
- "Trips": [1987, 3647, 2470, 4296, 4444],
- },
+ dict(
+ Origin=["A", "A", "B", "B", "C"],
+ Destination=["A", "B", "A", "C", "A"],
+ Period=["AM", "AM", "IP", "AM", "OP"],
+ TripPurp=["hbw", "nhb", "hbo", "nhb", "hbw"],
+ Trips=[1987, 3647, 2470, 4296, 4444],
+ ),
columns=["Origin", "Destination", "Period", "TripPurp", "Trips"],
).set_index(["Origin", "Destination", "Period", "TripPurp"])
@@ -50,13 +50,13 @@ def left_multi():
@pytest.fixture
def right_multi():
return DataFrame(
- {
- "Origin": ["A", "A", "B", "B", "C", "C", "E"],
- "Destination": ["A", "B", "A", "B", "A", "B", "F"],
- "Period": ["AM", "AM", "IP", "AM", "OP", "IP", "AM"],
- "LinkType": ["a", "b", "c", "b", "a", "b", "a"],
- "Distance": [100, 80, 90, 80, 75, 35, 55],
- },
+ dict(
+ Origin=["A", "A", "B", "B", "C", "C", "E"],
+ Destination=["A", "B", "A", "B", "A", "B", "F"],
+ Period=["AM", "AM", "IP", "AM", "OP", "IP", "AM"],
+ LinkType=["a", "b", "c", "b", "a", "b", "a"],
+ Distance=[100, 80, 90, 80, 75, 35, 55],
+ ),
columns=["Origin", "Destination", "Period", "LinkType", "Distance"],
).set_index(["Origin", "Destination", "Period", "LinkType"])
@@ -533,17 +533,17 @@ def test_join_multi_levels(self):
# GH 3662
# merge multi-levels
household = DataFrame(
- {
- "household_id": [1, 2, 3],
- "male": [0, 1, 0],
- "wealth": [196087.3, 316478.7, 294750],
- },
+ dict(
+ household_id=[1, 2, 3],
+ male=[0, 1, 0],
+ wealth=[196087.3, 316478.7, 294750],
+ ),
columns=["household_id", "male", "wealth"],
).set_index("household_id")
portfolio = DataFrame(
- {
- "household_id": [1, 2, 2, 3, 3, 3, 4],
- "asset_id": [
+ dict(
+ household_id=[1, 2, 2, 3, 3, 3, 4],
+ asset_id=[
"nl0000301109",
"nl0000289783",
"gb00b03mlx29",
@@ -552,7 +552,7 @@ def test_join_multi_levels(self):
"nl0000289965",
np.nan,
],
- "name": [
+ name=[
"ABN Amro",
"Robeco",
"Royal Dutch Shell",
@@ -561,24 +561,17 @@ def test_join_multi_levels(self):
"Postbank BioTech Fonds",
np.nan,
],
- "share": [1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0],
- },
+ share=[1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0],
+ ),
columns=["household_id", "asset_id", "name", "share"],
).set_index(["household_id", "asset_id"])
result = household.join(portfolio, how="inner")
expected = (
DataFrame(
- {
- "male": [0, 1, 1, 0, 0, 0],
- "wealth": [
- 196087.3,
- 316478.7,
- 316478.7,
- 294750.0,
- 294750.0,
- 294750.0,
- ],
- "name": [
+ dict(
+ male=[0, 1, 1, 0, 0, 0],
+ wealth=[196087.3, 316478.7, 316478.7, 294750.0, 294750.0, 294750.0],
+ name=[
"ABN Amro",
"Robeco",
"Royal Dutch Shell",
@@ -586,9 +579,9 @@ def test_join_multi_levels(self):
"AAB Eastern Europe Equity Fund",
"Postbank BioTech Fonds",
],
- "share": [1.00, 0.40, 0.60, 0.15, 0.60, 0.25],
- "household_id": [1, 2, 2, 3, 3, 3],
- "asset_id": [
+ share=[1.00, 0.40, 0.60, 0.15, 0.60, 0.25],
+ household_id=[1, 2, 2, 3, 3, 3],
+ asset_id=[
"nl0000301109",
"nl0000289783",
"gb00b03mlx29",
@@ -596,7 +589,7 @@ def test_join_multi_levels(self):
"lu0197800237",
"nl0000289965",
],
- }
+ )
)
.set_index(["household_id", "asset_id"])
.reindex(columns=["male", "wealth", "name", "share"])
@@ -618,7 +611,7 @@ def test_join_multi_levels(self):
expected,
(
DataFrame(
- {"share": [1.00]},
+ dict(share=[1.00]),
index=MultiIndex.from_tuples(
[(4, np.nan)], names=["household_id", "asset_id"]
),
@@ -649,9 +642,9 @@ def test_join_multi_levels2(self):
# some more advanced merges
# GH6360
household = DataFrame(
- {
- "household_id": [1, 2, 2, 3, 3, 3, 4],
- "asset_id": [
+ dict(
+ household_id=[1, 2, 2, 3, 3, 3, 4],
+ asset_id=[
"nl0000301109",
"nl0000301109",
"gb00b03mlx29",
@@ -660,36 +653,30 @@ def test_join_multi_levels2(self):
"nl0000289965",
np.nan,
],
- "share": [1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0],
- },
+ share=[1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0],
+ ),
columns=["household_id", "asset_id", "share"],
).set_index(["household_id", "asset_id"])
log_return = DataFrame(
- {
- "asset_id": [
+ dict(
+ asset_id=[
"gb00b03mlx29",
"gb00b03mlx29",
"gb00b03mlx29",
"lu0197800237",
"lu0197800237",
],
- "t": [233, 234, 235, 180, 181],
- "log_return": [
- 0.09604978,
- -0.06524096,
- 0.03532373,
- 0.03025441,
- 0.036997,
- ],
- }
+ t=[233, 234, 235, 180, 181],
+ log_return=[0.09604978, -0.06524096, 0.03532373, 0.03025441, 0.036997],
+ )
).set_index(["asset_id", "t"])
expected = (
DataFrame(
- {
- "household_id": [2, 2, 2, 3, 3, 3, 3, 3],
- "asset_id": [
+ dict(
+ household_id=[2, 2, 2, 3, 3, 3, 3, 3],
+ asset_id=[
"gb00b03mlx29",
"gb00b03mlx29",
"gb00b03mlx29",
@@ -699,9 +686,9 @@ def test_join_multi_levels2(self):
"lu0197800237",
"lu0197800237",
],
- "t": [233, 234, 235, 233, 234, 235, 180, 181],
- "share": [0.6, 0.6, 0.6, 0.15, 0.15, 0.15, 0.6, 0.6],
- "log_return": [
+ t=[233, 234, 235, 233, 234, 235, 180, 181],
+ share=[0.6, 0.6, 0.6, 0.15, 0.15, 0.15, 0.6, 0.6],
+ log_return=[
0.09604978,
-0.06524096,
0.03532373,
@@ -711,7 +698,7 @@ def test_join_multi_levels2(self):
0.03025441,
0.036997,
],
- }
+ )
)
.set_index(["household_id", "asset_id", "t"])
.reindex(columns=["share", "log_return"])
@@ -728,9 +715,9 @@ def test_join_multi_levels2(self):
expected = (
DataFrame(
- {
- "household_id": [1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4],
- "asset_id": [
+ dict(
+ household_id=[1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4],
+ asset_id=[
"nl0000301109",
"nl0000301109",
"gb00b03mlx29",
@@ -744,21 +731,8 @@ def test_join_multi_levels2(self):
"nl0000289965",
None,
],
- "t": [
- None,
- None,
- 233,
- 234,
- 235,
- 233,
- 234,
- 235,
- 180,
- 181,
- None,
- None,
- ],
- "share": [
+ t=[None, None, 233, 234, 235, 233, 234, 235, 180, 181, None, None],
+ share=[
1.0,
0.4,
0.6,
@@ -772,7 +746,7 @@ def test_join_multi_levels2(self):
0.25,
1.0,
],
- "log_return": [
+ log_return=[
None,
None,
0.09604978,
@@ -786,7 +760,7 @@ def test_join_multi_levels2(self):
None,
None,
],
- }
+ )
)
.set_index(["household_id", "asset_id", "t"])
.reindex(columns=["share", "log_return"])
diff --git a/pandas/tests/reshape/test_crosstab.py b/pandas/tests/reshape/test_crosstab.py
index 6faf64789c687..5f6037276b31c 100644
--- a/pandas/tests/reshape/test_crosstab.py
+++ b/pandas/tests/reshape/test_crosstab.py
@@ -535,32 +535,15 @@ def test_crosstab_with_numpy_size(self):
)
tm.assert_frame_equal(result, expected)
- def test_crosstab_duplicate_names(self):
- # GH 13279 / 22529
-
- s1 = Series(range(3), name="foo")
- s2_foo = Series(range(1, 4), name="foo")
- s2_bar = Series(range(1, 4), name="bar")
- s3 = Series(range(3), name="waldo")
-
- # check result computed with duplicate labels against
- # result computed with unique labels, then relabelled
- mapper = {"bar": "foo"}
-
- # duplicate row, column labels
- result = crosstab(s1, s2_foo)
- expected = crosstab(s1, s2_bar).rename_axis(columns=mapper, axis=1)
- tm.assert_frame_equal(result, expected)
-
- # duplicate row, unique column labels
- result = crosstab([s1, s2_foo], s3)
- expected = crosstab([s1, s2_bar], s3).rename_axis(index=mapper, axis=0)
- tm.assert_frame_equal(result, expected)
-
- # unique row, duplicate column labels
- result = crosstab(s3, [s1, s2_foo])
- expected = crosstab(s3, [s1, s2_bar]).rename_axis(columns=mapper, axis=1)
+ def test_crosstab_dup_index_names(self):
+ # GH 13279
+ s = Series(range(3), name="foo")
+ result = crosstab(s, s)
+ expected_index = Index(range(3), name="foo")
+ expected = DataFrame(
+ np.eye(3, dtype=np.int64), index=expected_index, columns=expected_index
+ )
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("names", [["a", ("b", "c")], [("a", "b"), "c"]])
diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py
index 4786b8c35a5b1..8aa4012b3e77c 100644
--- a/pandas/tests/reshape/test_cut.py
+++ b/pandas/tests/reshape/test_cut.py
@@ -377,10 +377,10 @@ def test_series_ret_bins():
@pytest.mark.parametrize(
"kwargs,msg",
[
- ({"duplicates": "drop"}, None),
- ({}, "Bin edges must be unique"),
- ({"duplicates": "raise"}, "Bin edges must be unique"),
- ({"duplicates": "foo"}, "invalid value for 'duplicates' parameter"),
+ (dict(duplicates="drop"), None),
+ (dict(), "Bin edges must be unique"),
+ (dict(duplicates="raise"), "Bin edges must be unique"),
+ (dict(duplicates="foo"), "invalid value for 'duplicates' parameter"),
],
)
def test_cut_duplicates_bin(kwargs, msg):
diff --git a/pandas/tests/reshape/test_qcut.py b/pandas/tests/reshape/test_qcut.py
index e7a04bafed8e3..c436ab5d90578 100644
--- a/pandas/tests/reshape/test_qcut.py
+++ b/pandas/tests/reshape/test_qcut.py
@@ -166,10 +166,10 @@ def test_qcut_list_like_labels(labels, expected):
@pytest.mark.parametrize(
"kwargs,msg",
[
- ({"duplicates": "drop"}, None),
- ({}, "Bin edges must be unique"),
- ({"duplicates": "raise"}, "Bin edges must be unique"),
- ({"duplicates": "foo"}, "invalid value for 'duplicates' parameter"),
+ (dict(duplicates="drop"), None),
+ (dict(), "Bin edges must be unique"),
+ (dict(duplicates="raise"), "Bin edges must be unique"),
+ (dict(duplicates="foo"), "invalid value for 'duplicates' parameter"),
],
)
def test_qcut_duplicates_bin(kwargs, msg):
diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py
index 9b87e32510b41..bce42f8c6caf0 100644
--- a/pandas/tests/scalar/period/test_period.py
+++ b/pandas/tests/scalar/period/test_period.py
@@ -487,22 +487,6 @@ def test_period_cons_combined(self):
with pytest.raises(ValueError, match=msg):
Period("2011-01", freq="1D1W")
- @pytest.mark.parametrize("day", ["1970/01/01 ", "2020-12-31 ", "1981/09/13 "])
- @pytest.mark.parametrize("hour", ["00:00:00", "00:00:01", "23:59:59", "12:00:59"])
- @pytest.mark.parametrize(
- "sec_float, expected",
- [
- (".000000001", 1),
- (".000000999", 999),
- (".123456789", 789),
- (".999999999", 999),
- ],
- )
- def test_period_constructor_nanosecond(self, day, hour, sec_float, expected):
- # GH 34621
-
- assert Period(day + hour + sec_float).start_time.nanosecond == expected
-
@pytest.mark.parametrize("hour", range(24))
def test_period_large_ordinal(self, hour):
# Issue #36430
diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py
index a15ef11f9c292..71ddf72562f36 100644
--- a/pandas/tests/series/indexing/test_datetime.py
+++ b/pandas/tests/series/indexing/test_datetime.py
@@ -561,7 +561,7 @@ def test_indexing():
expected = ts["2001"]
expected.name = "A"
- df = DataFrame({"A": ts})
+ df = DataFrame(dict(A=ts))
with tm.assert_produces_warning(FutureWarning):
# GH#36179 string indexing on rows for DataFrame deprecated
result = df["2001"]["A"]
diff --git a/pandas/tests/series/indexing/test_getitem.py b/pandas/tests/series/indexing/test_getitem.py
index b4c30cb6d4cd2..3686337141420 100644
--- a/pandas/tests/series/indexing/test_getitem.py
+++ b/pandas/tests/series/indexing/test_getitem.py
@@ -389,22 +389,10 @@ def test_getitem_generator(string_series):
tm.assert_series_equal(result2, expected)
-@pytest.mark.parametrize(
- "series",
- [
- Series([0, 1]),
- Series(date_range("2012-01-01", periods=2)),
- Series(date_range("2012-01-01", periods=2, tz="CET")),
- ],
-)
-def test_getitem_ndim_deprecated(series):
- with tm.assert_produces_warning(
- FutureWarning, match="Support for multi-dimensional indexing"
- ):
- result = series[:, None]
-
- expected = np.asarray(series)[:, None]
- tm.assert_numpy_array_equal(result, expected)
+def test_getitem_ndim_deprecated():
+ s = Series([0, 1])
+ with tm.assert_produces_warning(FutureWarning):
+ s[:, None]
def test_getitem_multilevel_scalar_slice_not_implemented(
diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py
index 159b42621f970..682c057f05700 100644
--- a/pandas/tests/series/indexing/test_indexing.py
+++ b/pandas/tests/series/indexing/test_indexing.py
@@ -309,7 +309,8 @@ def test_loc_setitem_2d_to_1d_raises():
msg = "|".join(
[
- r"shape mismatch: value array of shape \(2,2\)",
+ r"shape mismatch: value array of shape \(2,2\) could not be "
+ r"broadcast to indexing result of shape \(2,\)",
r"cannot reshape array of size 4 into shape \(2,\)",
]
)
@@ -666,9 +667,7 @@ def test_underlying_data_conversion():
df
df["val"].update(s)
- expected = DataFrame(
- {"a": [1, 2, 3], "b": [1, 2, 3], "c": [1, 2, 3], "val": [0, 1, 0]}
- )
+ expected = DataFrame(dict(a=[1, 2, 3], b=[1, 2, 3], c=[1, 2, 3], val=[0, 1, 0]))
return_value = expected.set_index(["a", "b", "c"], inplace=True)
assert return_value is None
tm.assert_frame_equal(df, expected)
@@ -691,11 +690,11 @@ def test_underlying_data_conversion():
pd.set_option("chained_assignment", "raise")
# GH 3217
- df = DataFrame({"a": [1, 3], "b": [np.nan, 2]})
+ df = DataFrame(dict(a=[1, 3], b=[np.nan, 2]))
df["c"] = np.nan
df["c"].update(Series(["foo"], index=[0]))
- expected = DataFrame({"a": [1, 3], "b": [np.nan, 2], "c": ["foo", np.nan]})
+ expected = DataFrame(dict(a=[1, 3], b=[np.nan, 2], c=["foo", np.nan]))
tm.assert_frame_equal(df, expected)
diff --git a/pandas/tests/series/indexing/test_xs.py b/pandas/tests/series/indexing/test_xs.py
index 83cc6d4670423..ca7ed50ab8875 100644
--- a/pandas/tests/series/indexing/test_xs.py
+++ b/pandas/tests/series/indexing/test_xs.py
@@ -56,8 +56,8 @@ def test_series_xs_droplevel_false(self):
mi = MultiIndex.from_tuples(
[("a", "x"), ("a", "y"), ("b", "x")], names=["level1", "level2"]
)
- ser = Series([1, 1, 1], index=mi)
- result = ser.xs("a", axis=0, drop_level=False)
+ df = Series([1, 1, 1], index=mi)
+ result = df.xs("a", axis=0, drop_level=False)
expected = Series(
[1, 1],
index=MultiIndex.from_tuples(
diff --git a/pandas/tests/series/methods/test_convert.py b/pandas/tests/series/methods/test_convert.py
index f052f4423d32a..b213e4a6c4c8a 100644
--- a/pandas/tests/series/methods/test_convert.py
+++ b/pandas/tests/series/methods/test_convert.py
@@ -3,23 +3,45 @@
import numpy as np
import pytest
-from pandas import Series, Timestamp
+from pandas import NaT, Series, Timestamp
import pandas._testing as tm
class TestConvert:
def test_convert(self):
# GH#10265
+ # Tests: All to nans, coerce, true
+ # Test coercion returns correct type
+ ser = Series(["a", "b", "c"])
+ results = ser._convert(datetime=True, coerce=True)
+ expected = Series([NaT] * 3)
+ tm.assert_series_equal(results, expected)
+
+ results = ser._convert(numeric=True, coerce=True)
+ expected = Series([np.nan] * 3)
+ tm.assert_series_equal(results, expected)
+
+ expected = Series([NaT] * 3, dtype=np.dtype("m8[ns]"))
+ results = ser._convert(timedelta=True, coerce=True)
+ tm.assert_series_equal(results, expected)
+
dt = datetime(2001, 1, 1, 0, 0)
td = dt - datetime(2000, 1, 1, 0, 0)
# Test coercion with mixed types
ser = Series(["a", "3.1415", dt, td])
+ results = ser._convert(datetime=True, coerce=True)
+ expected = Series([NaT, NaT, dt, NaT])
+ tm.assert_series_equal(results, expected)
- results = ser._convert(numeric=True)
+ results = ser._convert(numeric=True, coerce=True)
expected = Series([np.nan, 3.1415, np.nan, np.nan])
tm.assert_series_equal(results, expected)
+ results = ser._convert(timedelta=True, coerce=True)
+ expected = Series([NaT, NaT, NaT, td], dtype=np.dtype("m8[ns]"))
+ tm.assert_series_equal(results, expected)
+
# Test standard conversion returns original
results = ser._convert(datetime=True)
tm.assert_series_equal(results, ser)
@@ -94,6 +116,19 @@ def test_convert(self):
datetime(2001, 1, 3, 0, 0),
]
)
+ s2 = Series(
+ [
+ datetime(2001, 1, 1, 0, 0),
+ datetime(2001, 1, 2, 0, 0),
+ datetime(2001, 1, 3, 0, 0),
+ "foo",
+ 1.0,
+ 1,
+ Timestamp("20010104"),
+ "20010105",
+ ],
+ dtype="O",
+ )
result = ser._convert(datetime=True)
expected = Series(
@@ -102,12 +137,35 @@ def test_convert(self):
)
tm.assert_series_equal(result, expected)
- result = ser._convert(datetime=True)
+ result = ser._convert(datetime=True, coerce=True)
+ tm.assert_series_equal(result, expected)
+
+ expected = Series(
+ [
+ Timestamp("20010101"),
+ Timestamp("20010102"),
+ Timestamp("20010103"),
+ NaT,
+ NaT,
+ NaT,
+ Timestamp("20010104"),
+ Timestamp("20010105"),
+ ],
+ dtype="M8[ns]",
+ )
+ result = s2._convert(datetime=True, numeric=False, timedelta=False, coerce=True)
+ tm.assert_series_equal(result, expected)
+ result = s2._convert(datetime=True, coerce=True)
+ tm.assert_series_equal(result, expected)
+
+ ser = Series(["foo", "bar", 1, 1.0], dtype="O")
+ result = ser._convert(datetime=True, coerce=True)
+ expected = Series([NaT] * 2 + [Timestamp(1)] * 2)
tm.assert_series_equal(result, expected)
# preserver if non-object
ser = Series([1], dtype="float32")
- result = ser._convert(datetime=True)
+ result = ser._convert(datetime=True, coerce=True)
tm.assert_series_equal(result, ser)
# FIXME: dont leave commented-out
@@ -116,6 +174,16 @@ def test_convert(self):
# result = res._convert(convert_dates=True,convert_numeric=False)
# assert result.dtype == 'M8[ns]'
+    # dateutil parses some single letters into today's date
+ expected = Series([NaT])
+ for x in "abcdefghijklmnopqrstuvwxyz":
+ ser = Series([x])
+ result = ser._convert(datetime=True, coerce=True)
+ tm.assert_series_equal(result, expected)
+ ser = Series([x.upper()])
+ result = ser._convert(datetime=True, coerce=True)
+ tm.assert_series_equal(result, expected)
+
def test_convert_no_arg_error(self):
ser = Series(["1.0", "2"])
msg = r"At least one of datetime, numeric or timedelta must be True\."
diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py
index 920182a99e9ef..8a915324a72c1 100644
--- a/pandas/tests/series/methods/test_convert_dtypes.py
+++ b/pandas/tests/series/methods/test_convert_dtypes.py
@@ -8,186 +8,272 @@
import pandas as pd
import pandas._testing as tm
-# Each test case consists of a tuple with the data and dtype to create the
-# test Series, the default dtype for the expected result (which is valid
-# for most cases), and the specific cases where the result deviates from
-# this default. Those overrides are defined as a dict with (keyword, val) as
-# dictionary key. In case of multiple items, the last override takes precendence.
-test_cases = [
- (
- # data
- [1, 2, 3],
- # original dtype
- np.dtype("int32"),
- # default expected dtype
- "Int32",
- # exceptions on expected dtype
- {("convert_integer", False): np.dtype("int32")},
- ),
- (
- [1, 2, 3],
- np.dtype("int64"),
- "Int64",
- {("convert_integer", False): np.dtype("int64")},
- ),
- (
- ["x", "y", "z"],
- np.dtype("O"),
- pd.StringDtype(),
- {("convert_string", False): np.dtype("O")},
- ),
- (
- [True, False, np.nan],
- np.dtype("O"),
- pd.BooleanDtype(),
- {("convert_boolean", False): np.dtype("O")},
- ),
- (
- ["h", "i", np.nan],
- np.dtype("O"),
- pd.StringDtype(),
- {("convert_string", False): np.dtype("O")},
- ),
- ( # GH32117
- ["h", "i", 1],
- np.dtype("O"),
- np.dtype("O"),
- {},
- ),
- (
- [10, np.nan, 20],
- np.dtype("float"),
- "Int64",
- {
- ("convert_integer", False, "convert_floating", True): "Float64",
- ("convert_integer", False, "convert_floating", False): np.dtype("float"),
- },
- ),
- (
- [np.nan, 100.5, 200],
- np.dtype("float"),
- "Float64",
- {("convert_floating", False): np.dtype("float")},
- ),
- (
- [3, 4, 5],
- "Int8",
- "Int8",
- {},
- ),
- (
- [[1, 2], [3, 4], [5]],
- None,
- np.dtype("O"),
- {},
- ),
- (
- [4, 5, 6],
- np.dtype("uint32"),
- "UInt32",
- {("convert_integer", False): np.dtype("uint32")},
- ),
- (
- [-10, 12, 13],
- np.dtype("i1"),
- "Int8",
- {("convert_integer", False): np.dtype("i1")},
- ),
- (
- [1.2, 1.3],
- np.dtype("float32"),
- "Float32",
- {("convert_floating", False): np.dtype("float32")},
- ),
- (
- [1, 2.0],
- object,
- "Int64",
- {
- ("convert_integer", False): "Float64",
- ("convert_integer", False, "convert_floating", False): np.dtype("float"),
- ("infer_objects", False): np.dtype("object"),
- },
- ),
- (
- [1, 2.5],
- object,
- "Float64",
- {
- ("convert_floating", False): np.dtype("float"),
- ("infer_objects", False): np.dtype("object"),
- },
- ),
- (["a", "b"], pd.CategoricalDtype(), pd.CategoricalDtype(), {}),
- (
- pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]),
- pd.DatetimeTZDtype(tz="UTC"),
- pd.DatetimeTZDtype(tz="UTC"),
- {},
- ),
- (
- pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]),
- "datetime64[ns]",
- np.dtype("datetime64[ns]"),
- {},
- ),
- (
- pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]),
- object,
- np.dtype("datetime64[ns]"),
- {("infer_objects", False): np.dtype("object")},
- ),
- (pd.period_range("1/1/2011", freq="M", periods=3), None, pd.PeriodDtype("M"), {}),
- (
- pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)]),
- None,
- pd.IntervalDtype("int64"),
- {},
- ),
-]
-
class TestSeriesConvertDtypes:
+    # Each answerdict key is a tuple of 4 value tuples, one per argument
+    # (infer_objects, convert_string, convert_integer, convert_boolean),
+    # listing the argument values that map to that answer. This covers all
+    # 16 possible combinations; since common combinations expect the same
+    # answer, it provides an easy way to list all the possibilities.
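+    # For example, the key ((True, False), (True,), (True, False), (True, False))
+    # expands, via itertools.product in the test body, to the 8 combinations in
+    # which convert_string is True, all mapping to the same expected dtype.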
@pytest.mark.parametrize(
- "data, maindtype, expected_default, expected_other",
- test_cases,
+ "data, maindtype, answerdict",
+ [
+ (
+ [1, 2, 3],
+ np.dtype("int32"),
+ {
+ ((True, False), (True, False), (True,), (True, False)): "Int32",
+ ((True, False), (True, False), (False,), (True, False)): np.dtype(
+ "int32"
+ ),
+ },
+ ),
+ (
+ [1, 2, 3],
+ np.dtype("int64"),
+ {
+ ((True, False), (True, False), (True,), (True, False)): "Int64",
+ ((True, False), (True, False), (False,), (True, False)): np.dtype(
+ "int64"
+ ),
+ },
+ ),
+ (
+ ["x", "y", "z"],
+ np.dtype("O"),
+ {
+ (
+ (True, False),
+ (True,),
+ (True, False),
+ (True, False),
+ ): pd.StringDtype(),
+ ((True, False), (False,), (True, False), (True, False)): np.dtype(
+ "O"
+ ),
+ },
+ ),
+ (
+ [True, False, np.nan],
+ np.dtype("O"),
+ {
+ (
+ (True, False),
+ (True, False),
+ (True, False),
+ (True,),
+ ): pd.BooleanDtype(),
+ ((True, False), (True, False), (True, False), (False,)): np.dtype(
+ "O"
+ ),
+ },
+ ),
+ (
+ ["h", "i", np.nan],
+ np.dtype("O"),
+ {
+ (
+ (True, False),
+ (True,),
+ (True, False),
+ (True, False),
+ ): pd.StringDtype(),
+ ((True, False), (False,), (True, False), (True, False)): np.dtype(
+ "O"
+ ),
+ },
+ ),
+ ( # GH32117
+ ["h", "i", 1],
+ np.dtype("O"),
+ {
+ (
+ (True, False),
+ (True, False),
+ (True, False),
+ (True, False),
+ ): np.dtype("O"),
+ },
+ ),
+ (
+ [10, np.nan, 20],
+ np.dtype("float"),
+ {
+ ((True, False), (True, False), (True,), (True, False)): "Int64",
+ ((True, False), (True, False), (False,), (True, False)): np.dtype(
+ "float"
+ ),
+ },
+ ),
+ (
+ [np.nan, 100.5, 200],
+ np.dtype("float"),
+ {
+ (
+ (True, False),
+ (True, False),
+ (True, False),
+ (True, False),
+ ): np.dtype("float"),
+ },
+ ),
+ (
+ [3, 4, 5],
+ "Int8",
+ {((True, False), (True, False), (True, False), (True, False)): "Int8"},
+ ),
+ (
+ [[1, 2], [3, 4], [5]],
+ None,
+ {
+ (
+ (True, False),
+ (True, False),
+ (True, False),
+ (True, False),
+ ): np.dtype("O"),
+ },
+ ),
+ (
+ [4, 5, 6],
+ np.dtype("uint32"),
+ {
+ ((True, False), (True, False), (True,), (True, False)): "UInt32",
+ ((True, False), (True, False), (False,), (True, False)): np.dtype(
+ "uint32"
+ ),
+ },
+ ),
+ (
+ [-10, 12, 13],
+ np.dtype("i1"),
+ {
+ ((True, False), (True, False), (True,), (True, False)): "Int8",
+ ((True, False), (True, False), (False,), (True, False)): np.dtype(
+ "i1"
+ ),
+ },
+ ),
+ (
+ [1, 2.0],
+ object,
+ {
+ ((True,), (True, False), (True,), (True, False)): "Int64",
+ ((True,), (True, False), (False,), (True, False)): np.dtype(
+ "float"
+ ),
+ ((False,), (True, False), (True, False), (True, False)): np.dtype(
+ "object"
+ ),
+ },
+ ),
+ (
+ [1, 2.5],
+ object,
+ {
+ ((True,), (True, False), (True, False), (True, False)): np.dtype(
+ "float"
+ ),
+ ((False,), (True, False), (True, False), (True, False)): np.dtype(
+ "object"
+ ),
+ },
+ ),
+ (
+ ["a", "b"],
+ pd.CategoricalDtype(),
+ {
+ (
+ (True, False),
+ (True, False),
+ (True, False),
+ (True, False),
+ ): pd.CategoricalDtype(),
+ },
+ ),
+ (
+ pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]),
+ pd.DatetimeTZDtype(tz="UTC"),
+ {
+ (
+ (True, False),
+ (True, False),
+ (True, False),
+ (True, False),
+ ): pd.DatetimeTZDtype(tz="UTC"),
+ },
+ ),
+ (
+ pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]),
+ "datetime64[ns]",
+ {
+ (
+ (True, False),
+ (True, False),
+ (True, False),
+ (True, False),
+ ): np.dtype("datetime64[ns]"),
+ },
+ ),
+ (
+ pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]),
+ object,
+ {
+ ((True,), (True, False), (True, False), (True, False)): np.dtype(
+ "datetime64[ns]"
+ ),
+ ((False,), (True, False), (True, False), (True, False)): np.dtype(
+ "O"
+ ),
+ },
+ ),
+ (
+ pd.period_range("1/1/2011", freq="M", periods=3),
+ None,
+ {
+ (
+ (True, False),
+ (True, False),
+ (True, False),
+ (True, False),
+ ): pd.PeriodDtype("M"),
+ },
+ ),
+ (
+ pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)]),
+ None,
+ {
+ (
+ (True, False),
+ (True, False),
+ (True, False),
+ (True, False),
+ ): pd.IntervalDtype("int64"),
+ },
+ ),
+ ],
)
- @pytest.mark.parametrize("params", product(*[(True, False)] * 5))
- def test_convert_dtypes(
- self, data, maindtype, params, expected_default, expected_other
- ):
+ @pytest.mark.parametrize("params", product(*[(True, False)] * 4))
+ def test_convert_dtypes(self, data, maindtype, params, answerdict):
if maindtype is not None:
series = pd.Series(data, dtype=maindtype)
else:
series = pd.Series(data)
+ answers = {k: a for (kk, a) in answerdict.items() for k in product(*kk)}
- result = series.convert_dtypes(*params)
-
- param_names = [
- "infer_objects",
- "convert_string",
- "convert_integer",
- "convert_boolean",
- "convert_floating",
- ]
- params_dict = dict(zip(param_names, params))
-
- expected_dtype = expected_default
- for spec, dtype in expected_other.items():
- if all(params_dict[key] is val for key, val in zip(spec[::2], spec[1::2])):
- expected_dtype = dtype
-
- expected = pd.Series(data, dtype=expected_dtype)
- tm.assert_series_equal(result, expected)
+ ns = series.convert_dtypes(*params)
+ expected_dtype = answers[tuple(params)]
+ expected = pd.Series(series.values, dtype=expected_dtype)
+ tm.assert_series_equal(ns, expected)
# Test that it is a copy
copy = series.copy(deep=True)
- if is_interval_dtype(result.dtype) and result.dtype.subtype.kind in ["i", "u"]:
+ if is_interval_dtype(ns.dtype) and ns.dtype.subtype.kind in ["i", "u"]:
msg = "Cannot set float NaN to integer-backed IntervalArray"
with pytest.raises(ValueError, match=msg):
- result[result.notna()] = np.nan
+ ns[ns.notna()] = np.nan
else:
- result[result.notna()] = np.nan
+ ns[ns.notna()] = np.nan
# Make sure original not changed
tm.assert_series_equal(series, copy)
diff --git a/pandas/tests/series/methods/test_interpolate.py b/pandas/tests/series/methods/test_interpolate.py
index 8740a309eec13..1b05f72f5cf4d 100644
--- a/pandas/tests/series/methods/test_interpolate.py
+++ b/pandas/tests/series/methods/test_interpolate.py
@@ -37,7 +37,7 @@ def nontemporal_method(request):
separately from these non-temporal methods.
"""
method = request.param
- kwargs = {"order": 1} if method in ("spline", "polynomial") else {}
+ kwargs = dict(order=1) if method in ("spline", "polynomial") else dict()
return method, kwargs
@@ -67,7 +67,7 @@ def interp_methods_ind(request):
'values' as a parameterization
"""
method = request.param
- kwargs = {"order": 1} if method in ("spline", "polynomial") else {}
+ kwargs = dict(order=1) if method in ("spline", "polynomial") else dict()
return method, kwargs
@@ -458,82 +458,6 @@ def test_interp_limit_direction_raises(self, method, limit_direction, expected):
with pytest.raises(ValueError, match=msg):
s.interpolate(method=method, limit_direction=limit_direction)
- @pytest.mark.parametrize(
- "data, expected_data, kwargs",
- (
- (
- [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan],
- [np.nan, np.nan, 3.0, 3.0, 3.0, 3.0, 7.0, np.nan, np.nan],
- {"method": "pad", "limit_area": "inside"},
- ),
- (
- [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan],
- [np.nan, np.nan, 3.0, 3.0, np.nan, np.nan, 7.0, np.nan, np.nan],
- {"method": "pad", "limit_area": "inside", "limit": 1},
- ),
- (
- [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan],
- [np.nan, np.nan, 3.0, np.nan, np.nan, np.nan, 7.0, 7.0, 7.0],
- {"method": "pad", "limit_area": "outside"},
- ),
- (
- [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan],
- [np.nan, np.nan, 3.0, np.nan, np.nan, np.nan, 7.0, 7.0, np.nan],
- {"method": "pad", "limit_area": "outside", "limit": 1},
- ),
- (
- [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
- [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
- {"method": "pad", "limit_area": "outside", "limit": 1},
- ),
- (
- range(5),
- range(5),
- {"method": "pad", "limit_area": "outside", "limit": 1},
- ),
- ),
- )
- def test_interp_limit_area_with_pad(self, data, expected_data, kwargs):
- # GH26796
-
- s = Series(data)
- expected = Series(expected_data)
- result = s.interpolate(**kwargs)
- tm.assert_series_equal(result, expected)
-
- @pytest.mark.parametrize(
- "data, expected_data, kwargs",
- (
- (
- [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan],
- [np.nan, np.nan, 3.0, 7.0, 7.0, 7.0, 7.0, np.nan, np.nan],
- {"method": "bfill", "limit_area": "inside"},
- ),
- (
- [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan],
- [np.nan, np.nan, 3.0, np.nan, np.nan, 7.0, 7.0, np.nan, np.nan],
- {"method": "bfill", "limit_area": "inside", "limit": 1},
- ),
- (
- [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan],
- [3.0, 3.0, 3.0, np.nan, np.nan, np.nan, 7.0, np.nan, np.nan],
- {"method": "bfill", "limit_area": "outside"},
- ),
- (
- [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan],
- [np.nan, 3.0, 3.0, np.nan, np.nan, np.nan, 7.0, np.nan, np.nan],
- {"method": "bfill", "limit_area": "outside", "limit": 1},
- ),
- ),
- )
- def test_interp_limit_area_with_backfill(self, data, expected_data, kwargs):
- # GH26796
-
- s = Series(data)
- expected = Series(expected_data)
- result = s.interpolate(**kwargs)
- tm.assert_series_equal(result, expected)
-
def test_interp_limit_direction(self):
# These tests are for issue #9218 -- fill NaNs in both directions.
s = Series([1, 3, np.nan, np.nan, np.nan, 11])
diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py
index b204d92b9122f..c5196cea5d3bb 100644
--- a/pandas/tests/series/test_arithmetic.py
+++ b/pandas/tests/series/test_arithmetic.py
@@ -756,15 +756,12 @@ def test_align_date_objects_with_datetimeindex(self):
)
@pytest.mark.parametrize("box", [list, tuple, np.array, pd.Index, pd.Series, pd.array])
@pytest.mark.parametrize("flex", [True, False])
-def test_series_ops_name_retention(flex, box, names, all_binary_operators, request):
+def test_series_ops_name_retention(flex, box, names, all_binary_operators):
# GH#33930 consistent name retention
op = all_binary_operators
- if op is ops.rfloordiv and box in [list, tuple] and not flex:
- mark = pytest.mark.xfail(
- reason="op fails because of inconsistent ndarray-wrapping GH#28759"
- )
- request.node.add_marker(mark)
+ if op is ops.rfloordiv and box in [list, tuple]:
+ pytest.xfail("op fails because of inconsistent ndarray-wrapping GH#28759")
left = Series(range(10), name=names[0])
right = Series(range(10), name=names[1])
@@ -841,8 +838,14 @@ class TestInplaceOperations:
(
("Int64", "Int64", "Int64", "Int64"),
("float", "float", "float", "float"),
- ("Int64", "float", "Float64", "Float64"),
- ("Int64", "Float64", "Float64", "Float64"),
+ ("Int64", "float", "float", "float"),
+ pytest.param(
+ "Int64",
+ "Float64",
+ "Float64",
+ "Float64",
+ marks=pytest.mark.xfail(reason="Not implemented yet"),
+ ),
),
)
def test_series_inplace_ops(self, dtype1, dtype2, dtype_expected, dtype_mul):
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index 35411d7e9cfb7..d836ca7a53249 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -1044,6 +1044,7 @@ def test_different_nans_as_float64(self):
expected = np.array([True, True])
tm.assert_numpy_array_equal(result, expected)
+    @pytest.mark.xfail(reason="problem related to issue #34125")
def test_isin_int_df_string_search(self):
"""Comparing df with int`s (1,2) with a string at isin() ("1")
-> should not match values because int 1 is not equal str 1"""
@@ -1052,6 +1053,7 @@ def test_isin_int_df_string_search(self):
expected_false = DataFrame({"values": [False, False]})
tm.assert_frame_equal(result, expected_false)
+    @pytest.mark.xfail(reason="problem related to issue #34125")
def test_isin_nan_df_string_search(self):
"""Comparing df with nan value (np.nan,2) with a string at isin() ("NaN")
-> should not match values because np.nan is not equal str NaN"""
@@ -1060,6 +1062,7 @@ def test_isin_nan_df_string_search(self):
expected_false = DataFrame({"values": [False, False]})
tm.assert_frame_equal(result, expected_false)
+    @pytest.mark.xfail(reason="problem related to issue #34125")
def test_isin_float_df_string_search(self):
"""Comparing df with floats (1.4245,2.32441) with a string at isin() ("1.4245")
-> should not match values because float 1.4245 is not equal str 1.4245"""
diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py
index f89958f7723ef..713607d087bc0 100644
--- a/pandas/tests/tools/test_to_numeric.py
+++ b/pandas/tests/tools/test_to_numeric.py
@@ -48,8 +48,8 @@ def transform_assert_equal(request):
@pytest.mark.parametrize(
"input_kwargs,result_kwargs",
[
- ({}, {"dtype": np.int64}),
- ({"errors": "coerce", "downcast": "integer"}, {"dtype": np.int8}),
+ (dict(), dict(dtype=np.int64)),
+ (dict(errors="coerce", downcast="integer"), dict(dtype=np.int8)),
],
)
def test_empty(input_kwargs, result_kwargs):
@@ -147,10 +147,10 @@ def test_list():
@pytest.mark.parametrize(
"data,arr_kwargs",
[
- ([1, 3, 4, 5], {"dtype": np.int64}),
- ([1.0, 3.0, 4.0, 5.0], {}),
+ ([1, 3, 4, 5], dict(dtype=np.int64)),
+ ([1.0, 3.0, 4.0, 5.0], dict()),
# Boolean is regarded as numeric.
- ([True, False, True, True], {}),
+ ([True, False, True, True], dict()),
],
)
def test_list_numeric(data, arr_kwargs):
@@ -159,7 +159,7 @@ def test_list_numeric(data, arr_kwargs):
tm.assert_numpy_array_equal(result, expected)
-@pytest.mark.parametrize("kwargs", [{"dtype": "O"}, {}])
+@pytest.mark.parametrize("kwargs", [dict(dtype="O"), dict()])
def test_numeric(kwargs):
data = [1, -3.14, 7]
@@ -182,13 +182,13 @@ def test_numeric(kwargs):
def test_numeric_df_columns(columns):
# see gh-14827
df = DataFrame(
- {
- "a": [1.2, decimal.Decimal(3.14), decimal.Decimal("infinity"), "0.1"],
- "b": [1.0, 2.0, 3.0, 4.0],
- }
+ dict(
+ a=[1.2, decimal.Decimal(3.14), decimal.Decimal("infinity"), "0.1"],
+ b=[1.0, 2.0, 3.0, 4.0],
+ )
)
- expected = DataFrame({"a": [1.2, 3.14, np.inf, 0.1], "b": [1.0, 2.0, 3.0, 4.0]})
+ expected = DataFrame(dict(a=[1.2, 3.14, np.inf, 0.1], b=[1.0, 2.0, 3.0, 4.0]))
df_copy = df.copy()
df_copy[columns] = df_copy[columns].apply(to_numeric)
@@ -208,10 +208,10 @@ def test_numeric_df_columns(columns):
)
def test_numeric_embedded_arr_likes(data, exp_data):
# Test to_numeric with embedded lists and arrays
- df = DataFrame({"a": data})
+ df = DataFrame(dict(a=data))
df["a"] = df["a"].apply(to_numeric)
- expected = DataFrame({"a": exp_data})
+ expected = DataFrame(dict(a=exp_data))
tm.assert_frame_equal(df, expected)
@@ -226,7 +226,7 @@ def test_all_nan():
def test_type_check(errors):
# see gh-11776
df = DataFrame({"a": [1, -3.14, 7], "b": ["4", "5", "6"]})
- kwargs = {"errors": errors} if errors is not None else {}
+ kwargs = dict(errors=errors) if errors is not None else dict()
error_ctx = pytest.raises(TypeError, match="1-d array")
with error_ctx:
@@ -241,7 +241,7 @@ def test_scalar(val, signed, transform):
def test_really_large_scalar(large_val, signed, transform, errors):
# see gh-24910
- kwargs = {"errors": errors} if errors is not None else {}
+ kwargs = dict(errors=errors) if errors is not None else dict()
val = -large_val if signed else large_val
val = transform(val)
@@ -258,7 +258,7 @@ def test_really_large_scalar(large_val, signed, transform, errors):
def test_really_large_in_arr(large_val, signed, transform, multiple_elts, errors):
# see gh-24910
- kwargs = {"errors": errors} if errors is not None else {}
+ kwargs = dict(errors=errors) if errors is not None else dict()
val = -large_val if signed else large_val
val = transform(val)
@@ -300,7 +300,7 @@ def test_really_large_in_arr_consistent(large_val, signed, multiple_elts, errors
#
# Even if we discover that we have to hold float, does not mean
# we should be lenient on subsequent elements that fail to be integer.
- kwargs = {"errors": errors} if errors is not None else {}
+ kwargs = dict(errors=errors) if errors is not None else dict()
arr = [str(-large_val if signed else large_val)]
if multiple_elts:
@@ -452,12 +452,12 @@ def test_errors_invalid_value():
"kwargs,exp_dtype",
[
# Basic function tests.
- ({}, np.int64),
- ({"downcast": None}, np.int64),
+ (dict(), np.int64),
+ (dict(downcast=None), np.int64),
# Support below np.float32 is rare and far between.
- ({"downcast": "float"}, np.dtype(np.float32).char),
+ (dict(downcast="float"), np.dtype(np.float32).char),
# Basic dtype support.
- ({"downcast": "unsigned"}, np.dtype(np.typecodes["UnsignedInteger"][0])),
+ (dict(downcast="unsigned"), np.dtype(np.typecodes["UnsignedInteger"][0])),
],
)
def test_downcast_basic(data, kwargs, exp_dtype):
diff --git a/pandas/tests/tseries/holiday/test_holiday.py b/pandas/tests/tseries/holiday/test_holiday.py
index 0fb1da777e357..a2c146dbd65e8 100644
--- a/pandas/tests/tseries/holiday/test_holiday.py
+++ b/pandas/tests/tseries/holiday/test_holiday.py
@@ -210,16 +210,16 @@ def test_argument_types(transform):
@pytest.mark.parametrize(
"name,kwargs",
[
- ("One-Time", {"year": 2012, "month": 5, "day": 28}),
+ ("One-Time", dict(year=2012, month=5, day=28)),
(
"Range",
- {
- "month": 5,
- "day": 28,
- "start_date": datetime(2012, 1, 1),
- "end_date": datetime(2012, 12, 31),
- "offset": DateOffset(weekday=MO(1)),
- },
+ dict(
+ month=5,
+ day=28,
+ start_date=datetime(2012, 1, 1),
+ end_date=datetime(2012, 12, 31),
+ offset=DateOffset(weekday=MO(1)),
+ ),
),
],
)
diff --git a/pandas/tests/tseries/holiday/test_observance.py b/pandas/tests/tseries/holiday/test_observance.py
index 83038ad254b77..9ee63d2a36556 100644
--- a/pandas/tests/tseries/holiday/test_observance.py
+++ b/pandas/tests/tseries/holiday/test_observance.py
@@ -22,7 +22,6 @@
_SUNDAY = datetime(2014, 4, 13)
_MONDAY = datetime(2014, 4, 14)
_TUESDAY = datetime(2014, 4, 15)
-_NEXT_WEDNESDAY = datetime(2014, 4, 16)
@pytest.mark.parametrize("day", [_SATURDAY, _SUNDAY])
@@ -61,15 +60,7 @@ def test_weekend_to_monday(day, expected):
@pytest.mark.parametrize(
- "day,expected",
- [
- (_WEDNESDAY, _THURSDAY),
- (_THURSDAY, _FRIDAY),
- (_SATURDAY, _MONDAY),
- (_SUNDAY, _MONDAY),
- (_MONDAY, _TUESDAY),
- (_TUESDAY, _NEXT_WEDNESDAY), # WED is same week as TUE
- ],
+ "day,expected", [(_SATURDAY, _MONDAY), (_SUNDAY, _MONDAY), (_MONDAY, _TUESDAY)]
)
def test_next_workday(day, expected):
assert next_workday(day) == expected
@@ -83,16 +74,7 @@ def test_previous_workday(day, expected):
@pytest.mark.parametrize(
- "day,expected",
- [
- (_THURSDAY, _WEDNESDAY),
- (_FRIDAY, _THURSDAY),
- (_SATURDAY, _THURSDAY),
- (_SUNDAY, _FRIDAY),
- (_MONDAY, _FRIDAY), # last week Friday
- (_TUESDAY, _MONDAY),
- (_NEXT_WEDNESDAY, _TUESDAY), # WED is same week as TUE
- ],
+ "day,expected", [(_SATURDAY, _THURSDAY), (_SUNDAY, _FRIDAY), (_TUESDAY, _MONDAY)]
)
def test_before_nearest_workday(day, expected):
assert before_nearest_workday(day) == expected
diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py
index 1ac98247780b7..fca1316493e85 100644
--- a/pandas/tests/tseries/offsets/test_offsets.py
+++ b/pandas/tests/tseries/offsets/test_offsets.py
@@ -4191,8 +4191,8 @@ class TestDST:
# test both basic names and dateutil timezones
timezone_utc_offsets = {
- "US/Eastern": {"utc_offset_daylight": -4, "utc_offset_standard": -5},
- "dateutil/US/Pacific": {"utc_offset_daylight": -7, "utc_offset_standard": -8},
+ "US/Eastern": dict(utc_offset_daylight=-4, utc_offset_standard=-5),
+ "dateutil/US/Pacific": dict(utc_offset_daylight=-7, utc_offset_standard=-8),
}
valid_date_offsets_singular = [
"weekday",
diff --git a/pandas/tests/tslibs/test_to_offset.py b/pandas/tests/tslibs/test_to_offset.py
index 5b1134ee85e2c..93e5e2c801c09 100644
--- a/pandas/tests/tslibs/test_to_offset.py
+++ b/pandas/tests/tslibs/test_to_offset.py
@@ -131,15 +131,15 @@ def test_to_offset_leading_plus(freqstr, expected):
@pytest.mark.parametrize(
"kwargs,expected",
[
- ({"days": 1, "seconds": 1}, offsets.Second(86401)),
- ({"days": -1, "seconds": 1}, offsets.Second(-86399)),
- ({"hours": 1, "minutes": 10}, offsets.Minute(70)),
- ({"hours": 1, "minutes": -10}, offsets.Minute(50)),
- ({"weeks": 1}, offsets.Day(7)),
- ({"hours": 1}, offsets.Hour(1)),
- ({"hours": 1}, to_offset("60min")),
- ({"microseconds": 1}, offsets.Micro(1)),
- ({"microseconds": 0}, offsets.Nano(0)),
+ (dict(days=1, seconds=1), offsets.Second(86401)),
+ (dict(days=-1, seconds=1), offsets.Second(-86399)),
+ (dict(hours=1, minutes=10), offsets.Minute(70)),
+ (dict(hours=1, minutes=-10), offsets.Minute(50)),
+ (dict(weeks=1), offsets.Day(7)),
+ (dict(hours=1), offsets.Hour(1)),
+ (dict(hours=1), to_offset("60min")),
+ (dict(microseconds=1), offsets.Micro(1)),
+ (dict(microseconds=0), offsets.Nano(0)),
],
)
def test_to_offset_pd_timedelta(kwargs, expected):
diff --git a/pandas/tests/util/test_assert_categorical_equal.py b/pandas/tests/util/test_assert_categorical_equal.py
index 29a0805bceb98..8957e7a172666 100644
--- a/pandas/tests/util/test_assert_categorical_equal.py
+++ b/pandas/tests/util/test_assert_categorical_equal.py
@@ -16,7 +16,7 @@ def test_categorical_equal(c):
def test_categorical_equal_order_mismatch(check_category_order):
c1 = Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4])
c2 = Categorical([1, 2, 3, 4], categories=[4, 3, 2, 1])
- kwargs = {"check_category_order": check_category_order}
+ kwargs = dict(check_category_order=check_category_order)
if check_category_order:
msg = """Categorical\\.categories are different
diff --git a/pandas/tests/util/test_assert_extension_array_equal.py b/pandas/tests/util/test_assert_extension_array_equal.py
index 545f0dcbf695f..f9259beab5d13 100644
--- a/pandas/tests/util/test_assert_extension_array_equal.py
+++ b/pandas/tests/util/test_assert_extension_array_equal.py
@@ -9,9 +9,9 @@
@pytest.mark.parametrize(
"kwargs",
[
- {}, # Default is check_exact=False
- {"check_exact": False},
- {"check_exact": True},
+ dict(), # Default is check_exact=False
+ dict(check_exact=False),
+ dict(check_exact=True),
],
)
def test_assert_extension_array_equal_not_exact(kwargs):
@@ -55,7 +55,7 @@ def test_assert_extension_array_equal_less_precise(decimals):
def test_assert_extension_array_equal_dtype_mismatch(check_dtype):
end = 5
- kwargs = {"check_dtype": check_dtype}
+ kwargs = dict(check_dtype=check_dtype)
arr1 = SparseArray(np.arange(end, dtype="int64"))
arr2 = SparseArray(np.arange(end, dtype="int32"))
diff --git a/pandas/tests/util/test_assert_frame_equal.py b/pandas/tests/util/test_assert_frame_equal.py
index 8034ace479a62..d5161ce37494b 100644
--- a/pandas/tests/util/test_assert_frame_equal.py
+++ b/pandas/tests/util/test_assert_frame_equal.py
@@ -120,7 +120,7 @@ def test_frame_equal_shape_mismatch(df1, df2, obj_fixture):
],
)
def test_frame_equal_index_dtype_mismatch(df1, df2, msg, check_index_type):
- kwargs = {"check_index_type": check_index_type}
+ kwargs = dict(check_index_type=check_index_type)
if check_index_type:
with pytest.raises(AssertionError, match=msg):
@@ -134,7 +134,7 @@ def test_empty_dtypes(check_dtype):
df1 = DataFrame(columns=columns)
df2 = DataFrame(columns=columns)
- kwargs = {"check_dtype": check_dtype}
+ kwargs = dict(check_dtype=check_dtype)
df1["col1"] = df1["col1"].astype("int64")
if check_dtype:
@@ -272,20 +272,6 @@ def test_assert_frame_equal_ignore_extension_dtype_mismatch(right_dtype):
tm.assert_frame_equal(left, right, check_dtype=False)
-@pytest.mark.parametrize(
- "dtype",
- [
- ("timedelta64[ns]"),
- ("datetime64[ns, UTC]"),
- ("Period[D]"),
- ],
-)
-def test_assert_frame_equal_datetime_like_dtype_mismatch(dtype):
- df1 = DataFrame({"a": []}, dtype=dtype)
- df2 = DataFrame({"a": []})
- tm.assert_frame_equal(df1, df2, check_dtype=False)
-
-
def test_allows_duplicate_labels():
left = DataFrame()
right = DataFrame().set_flags(allows_duplicate_labels=False)
diff --git a/pandas/tests/util/test_assert_index_equal.py b/pandas/tests/util/test_assert_index_equal.py
index 988a0e7b24379..21d5a456e20d0 100644
--- a/pandas/tests/util/test_assert_index_equal.py
+++ b/pandas/tests/util/test_assert_index_equal.py
@@ -85,7 +85,7 @@ def test_index_equal_values_close(check_exact):
def test_index_equal_values_less_close(check_exact, rtol):
idx1 = Index([1, 2, 3.0])
idx2 = Index([1, 2, 3.0001])
- kwargs = {"check_exact": check_exact, "rtol": rtol}
+ kwargs = dict(check_exact=check_exact, rtol=rtol)
if check_exact or rtol < 0.5e-3:
msg = """Index are different
@@ -103,7 +103,7 @@ def test_index_equal_values_less_close(check_exact, rtol):
def test_index_equal_values_too_far(check_exact, rtol):
idx1 = Index([1, 2, 3])
idx2 = Index([1, 2, 4])
- kwargs = {"check_exact": check_exact, "rtol": rtol}
+ kwargs = dict(check_exact=check_exact, rtol=rtol)
msg = """Index are different
@@ -140,7 +140,7 @@ def test_index_equal_value_oder_mismatch(check_exact, rtol, check_order):
def test_index_equal_level_values_mismatch(check_exact, rtol):
idx1 = MultiIndex.from_tuples([("A", 2), ("A", 2), ("B", 3), ("B", 4)])
idx2 = MultiIndex.from_tuples([("A", 1), ("A", 2), ("B", 3), ("B", 4)])
- kwargs = {"check_exact": check_exact, "rtol": rtol}
+ kwargs = dict(check_exact=check_exact, rtol=rtol)
msg = """MultiIndex level \\[1\\] are different
diff --git a/pandas/tests/util/test_assert_interval_array_equal.py b/pandas/tests/util/test_assert_interval_array_equal.py
index 8cc4ade3d7e95..2e8699536c72a 100644
--- a/pandas/tests/util/test_assert_interval_array_equal.py
+++ b/pandas/tests/util/test_assert_interval_array_equal.py
@@ -7,9 +7,9 @@
@pytest.mark.parametrize(
"kwargs",
[
- {"start": 0, "periods": 4},
- {"start": 1, "periods": 5},
- {"start": 5, "end": 10, "closed": "left"},
+ dict(start=0, periods=4),
+ dict(start=1, periods=5),
+ dict(start=5, end=10, closed="left"),
],
)
def test_interval_array_equal(kwargs):
@@ -18,7 +18,7 @@ def test_interval_array_equal(kwargs):
def test_interval_array_equal_closed_mismatch():
- kwargs = {"start": 0, "periods": 5}
+ kwargs = dict(start=0, periods=5)
arr1 = interval_range(closed="left", **kwargs).values
arr2 = interval_range(closed="right", **kwargs).values
@@ -34,7 +34,7 @@ def test_interval_array_equal_closed_mismatch():
def test_interval_array_equal_periods_mismatch():
- kwargs = {"start": 0}
+ kwargs = dict(start=0)
arr1 = interval_range(periods=5, **kwargs).values
arr2 = interval_range(periods=6, **kwargs).values
@@ -50,7 +50,7 @@ def test_interval_array_equal_periods_mismatch():
def test_interval_array_equal_end_mismatch():
- kwargs = {"start": 0, "periods": 5}
+ kwargs = dict(start=0, periods=5)
arr1 = interval_range(end=10, **kwargs).values
arr2 = interval_range(end=20, **kwargs).values
@@ -66,7 +66,7 @@ def test_interval_array_equal_end_mismatch():
def test_interval_array_equal_start_mismatch():
- kwargs = {"periods": 4}
+ kwargs = dict(periods=4)
arr1 = interval_range(start=0, **kwargs).values
arr2 = interval_range(start=1, **kwargs).values
diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py
index ae4523014b01d..0f56fb0b93642 100644
--- a/pandas/tests/util/test_assert_series_equal.py
+++ b/pandas/tests/util/test_assert_series_equal.py
@@ -87,9 +87,9 @@ def test_series_not_equal_value_mismatch(data1, data2):
@pytest.mark.parametrize(
"kwargs",
[
- {"dtype": "float64"}, # dtype mismatch
- {"index": [1, 2, 4]}, # index mismatch
- {"name": "foo"}, # name mismatch
+ dict(dtype="float64"), # dtype mismatch
+ dict(index=[1, 2, 4]), # index mismatch
+ dict(name="foo"), # name mismatch
],
)
def test_series_not_equal_metadata_mismatch(kwargs):
@@ -140,7 +140,7 @@ def test_less_precise(data1, data2, dtype, decimals):
],
)
def test_series_equal_index_dtype(s1, s2, msg, check_index_type):
- kwargs = {"check_index_type": check_index_type}
+ kwargs = dict(check_index_type=check_index_type)
if check_index_type:
with pytest.raises(AssertionError, match=msg):
diff --git a/pandas/tests/util/test_show_versions.py b/pandas/tests/util/test_show_versions.py
index 4ea3ebe5000ad..fe5fc3e21d960 100644
--- a/pandas/tests/util/test_show_versions.py
+++ b/pandas/tests/util/test_show_versions.py
@@ -39,7 +39,7 @@ def test_show_versions(capsys):
assert re.search(r"commit\s*:\s[0-9a-f]{40}\n", result)
# check required dependency
- assert re.search(r"numpy\s*:\s([0-9\.\+a-f\_]|dev)+\n", result)
+ assert re.search(r"numpy\s*:\s([0-9\.\+a-f]|dev)+\n", result)
# check optional dependency
assert re.search(r"pyarrow\s*:\s([0-9\.]+|None)\n", result)
diff --git a/pandas/tests/util/test_validate_args.py b/pandas/tests/util/test_validate_args.py
index db532480efe07..746d859b3322e 100644
--- a/pandas/tests/util/test_validate_args.py
+++ b/pandas/tests/util/test_validate_args.py
@@ -30,7 +30,7 @@ def test_bad_arg_length_max_value_single():
def test_bad_arg_length_max_value_multiple():
args = (None, None)
- compat_args = {"foo": None}
+ compat_args = dict(foo=None)
min_fname_arg_count = 2
max_length = len(compat_args) + min_fname_arg_count
@@ -61,7 +61,7 @@ def test_not_all_defaults(i):
def test_validation():
# No exceptions should be raised.
- validate_args(_fname, (None,), 2, {"out": None})
+ validate_args(_fname, (None,), 2, dict(out=None))
compat_args = {"axis": 1, "out": None}
validate_args(_fname, (1, None), 2, compat_args)
diff --git a/pandas/tests/util/test_validate_kwargs.py b/pandas/tests/util/test_validate_kwargs.py
index c357affb6203d..8fe2a3712bf49 100644
--- a/pandas/tests/util/test_validate_kwargs.py
+++ b/pandas/tests/util/test_validate_kwargs.py
@@ -41,7 +41,7 @@ def test_validation():
# No exceptions should be raised.
compat_args = {"f": None, "b": 1, "ba": "s"}
- kwargs = {"f": None, "b": 1}
+ kwargs = dict(f=None, b=1)
validate_kwargs(_fname, kwargs, compat_args)
diff --git a/pandas/tests/window/moments/test_moments_consistency_ewm.py b/pandas/tests/window/moments/test_moments_consistency_ewm.py
index 57665b47dea7f..aa3453680190b 100644
--- a/pandas/tests/window/moments/test_moments_consistency_ewm.py
+++ b/pandas/tests/window/moments/test_moments_consistency_ewm.py
@@ -11,6 +11,7 @@ def test_ewm_pairwise_cov_corr(func, frame):
result = result.loc[(slice(None), 1), 5]
result.index = result.index.droplevel(1)
expected = getattr(frame[1].ewm(span=10, min_periods=5), func)(frame[5])
+ expected.index = expected.index._with_freq(None)
tm.assert_series_equal(result, expected, check_names=False)
diff --git a/pandas/tests/window/moments/test_moments_consistency_rolling.py b/pandas/tests/window/moments/test_moments_consistency_rolling.py
index 53e5354340dcc..802ece77fd36d 100644
--- a/pandas/tests/window/moments/test_moments_consistency_rolling.py
+++ b/pandas/tests/window/moments/test_moments_consistency_rolling.py
@@ -51,6 +51,7 @@ def test_rolling_pairwise_cov_corr(func, frame):
result = result.loc[(slice(None), 1), 5]
result.index = result.index.droplevel(1)
expected = getattr(frame[1].rolling(window=10, min_periods=5), func)(frame[5])
+ expected.index = expected.index._with_freq(None)
tm.assert_series_equal(result, expected, check_names=False)
diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py
index b89fb35ac3a70..f9b5a5fe9a3c1 100644
--- a/pandas/tests/window/test_groupby.py
+++ b/pandas/tests/window/test_groupby.py
@@ -1,15 +1,7 @@
import numpy as np
import pytest
-from pandas import (
- DataFrame,
- Index,
- MultiIndex,
- Series,
- Timestamp,
- date_range,
- to_datetime,
-)
+from pandas import DataFrame, MultiIndex, Series, Timestamp, date_range, to_datetime
import pandas._testing as tm
from pandas.api.indexers import BaseIndexer
from pandas.core.groupby.groupby import get_groupby
@@ -426,23 +418,12 @@ def test_groupby_rolling_empty_frame(self):
# GH 36197
expected = DataFrame({"s1": []})
result = expected.groupby("s1").rolling(window=1).sum()
- # GH-38057 from_tuples gives empty object dtype, we now get float/int levels
- # expected.index = MultiIndex.from_tuples([], names=["s1", None])
- expected.index = MultiIndex.from_product(
- [Index([], dtype="float64"), Index([], dtype="int64")], names=["s1", None]
- )
+ expected.index = MultiIndex.from_tuples([], names=["s1", None])
tm.assert_frame_equal(result, expected)
expected = DataFrame({"s1": [], "s2": []})
result = expected.groupby(["s1", "s2"]).rolling(window=1).sum()
- expected.index = MultiIndex.from_product(
- [
- Index([], dtype="float64"),
- Index([], dtype="float64"),
- Index([], dtype="int64"),
- ],
- names=["s1", "s2", None],
- )
+ expected.index = MultiIndex.from_tuples([], names=["s1", "s2", None])
tm.assert_frame_equal(result, expected)
def test_groupby_rolling_string_index(self):
@@ -586,60 +567,6 @@ def test_groupby_rolling_index_level_and_column_label(self):
)
tm.assert_frame_equal(result, expected)
- def test_groupby_rolling_resulting_multiindex(self):
- # a few different cases checking the created MultiIndex of the result
- # https://github.com/pandas-dev/pandas/pull/38057
-
- # grouping by 1 columns -> 2-level MI as result
- df = DataFrame({"a": np.arange(8.0), "b": [1, 2] * 4})
- result = df.groupby("b").rolling(3).mean()
- expected_index = MultiIndex.from_tuples(
- [(1, 0), (1, 2), (1, 4), (1, 6), (2, 1), (2, 3), (2, 5), (2, 7)],
- names=["b", None],
- )
- tm.assert_index_equal(result.index, expected_index)
-
- # grouping by 2 columns -> 3-level MI as result
- df = DataFrame({"a": np.arange(12.0), "b": [1, 2] * 6, "c": [1, 2, 3, 4] * 3})
- result = df.groupby(["b", "c"]).rolling(2).sum()
- expected_index = MultiIndex.from_tuples(
- [
- (1, 1, 0),
- (1, 1, 4),
- (1, 1, 8),
- (1, 3, 2),
- (1, 3, 6),
- (1, 3, 10),
- (2, 2, 1),
- (2, 2, 5),
- (2, 2, 9),
- (2, 4, 3),
- (2, 4, 7),
- (2, 4, 11),
- ],
- names=["b", "c", None],
- )
- tm.assert_index_equal(result.index, expected_index)
-
- # grouping with 1 level on dataframe with 2-level MI -> 3-level MI as result
- df = DataFrame({"a": np.arange(8.0), "b": [1, 2] * 4, "c": [1, 2, 3, 4] * 2})
- df = df.set_index("c", append=True)
- result = df.groupby("b").rolling(3).mean()
- expected_index = MultiIndex.from_tuples(
- [
- (1, 0, 1),
- (1, 2, 3),
- (1, 4, 1),
- (1, 6, 3),
- (2, 1, 2),
- (2, 3, 4),
- (2, 5, 2),
- (2, 7, 4),
- ],
- names=["b", None, "c"],
- )
- tm.assert_index_equal(result.index, expected_index)
-
class TestExpanding:
def setup_method(self):
diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py
index 10b23cadfe279..1658cca347786 100644
--- a/pandas/tests/window/test_rolling.py
+++ b/pandas/tests/window/test_rolling.py
@@ -1085,15 +1085,8 @@ def test_groupby_rolling_nan_included():
result = df.groupby("group", dropna=False).rolling(1, min_periods=1).mean()
expected = DataFrame(
{"B": [0.0, 2.0, 3.0, 1.0, 4.0]},
- # GH-38057 from_tuples puts the NaNs in the codes, result expects them
- # to be in the levels, at the moment
- # index=MultiIndex.from_tuples(
- # [("g1", 0), ("g1", 2), ("g2", 3), (np.nan, 1), (np.nan, 4)],
- # names=["group", None],
- # ),
- index=MultiIndex(
- [["g1", "g2", np.nan], [0, 1, 2, 3, 4]],
- [[0, 0, 1, 2, 2], [0, 2, 3, 1, 4]],
+ index=MultiIndex.from_tuples(
+ [("g1", 0), ("g1", 2), ("g2", 3), (np.nan, 1), (np.nan, 4)],
names=["group", None],
),
)
diff --git a/pandas/util/_print_versions.py b/pandas/util/_print_versions.py
index 5256cc29d5543..72003eeddf5ee 100644
--- a/pandas/util/_print_versions.py
+++ b/pandas/util/_print_versions.py
@@ -106,7 +106,7 @@ def show_versions(as_json: Union[str, bool] = False) -> None:
deps = _get_dependency_info()
if as_json:
- j = {"system": sys_info, "dependencies": deps}
+ j = dict(system=sys_info, dependencies=deps)
if as_json is True:
print(j)
From a0262ab0b4f5be9b452412c9862184fabed9ad9d Mon Sep 17 00:00:00 2001
From: GYHHAHA <1801214626@qq.com>
Date: Fri, 11 Dec 2020 12:53:05 +0800
Subject: [PATCH 26/42] Revert "fix doc"
This reverts commit b49229367fc3ab02e81c8c373d05c021560054f2.
---
doc/source/user_guide/timeseries.rst | 28 ++++++++++++--------------
doc/source/whatsnew/v1.2.0.rst | 30 +++++++++++++++++++---------
2 files changed, 34 insertions(+), 24 deletions(-)
diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst
index 843da644848b1..bee72ec70d95e 100644
--- a/doc/source/user_guide/timeseries.rst
+++ b/doc/source/user_guide/timeseries.rst
@@ -1888,39 +1888,31 @@ Those two examples are equivalent for this time series:
Note the use of ``'start'`` for ``origin`` on the last example. In that case, ``origin`` will be set to the first value of the timeseries.
-.. _timeseries.backward-resample:
-
Backward resample
~~~~~~~~~~~~~~~~~
.. versionadded:: 1.2.0
-``origin`` can not only make a foreward resample, namely grouping from the starting point with the given ``freq``, but is also able to implement the backward resample. This method allows users to control bins of the grouping from the given origin with a backward direction. (:issue:`37804`)
+``origin`` can not only make a forward resample, namely grouping from the starting point with the given ``freq`` , but is also able to implement the backward resample. This method allows users to control bins of the grouping from the given origin with a backward direction. (:issue:`37804`)
.. ipython:: python
start, end = "2000-10-01 23:30:00", "2000-10-02 00:26:00"
- rng = pd.date_range(start, end, freq="7min")
- ts = pd.Series(np.arange(len(rng)) * 3, index=rng)
+ rng = date_range(start, end, freq="7min")
+ ts = Series(np.arange(len(rng)) * 3, index=rng)
-Setting ``offset='end'`` means using the max ``Timestamp`` as the ``origin`` with ``backward=True``.
+Setting ``offset='end'`` means using the max ``Timestamp`` as the ``origin`` with ``backward=True`` .
ts.index.max()
ts.resample("17min", origin="end").sum()
-The forward resample output stands for the grouping result from current datetimeindex to the next one with ``closed=left`` by default. In contrast, the backward resample output stands for the grouping result from former datetimeindex to the current one with ``closed=right`` by default. If you want to change this, ``closed=left`` is available.
-
-.. ipython:: python
-
- ts.resample("17min", closed="left", origin="end").sum()
-
-Setting ``offset='end_day'`` means using the ceiling midnight of the max ``Timestamp`` as the ``origin`` with ``backward=True``.
+Setting ``offset='end'`` means using the ceiling midnight of the max ``Timestamp`` as the ``origin`` with ``backward=True`` .
.. ipython:: python
- ts.resample("17min", origin="end_day").sum()
+ ts.resample("17min", origin="end").sum()
-If you want to make the backward resample from a Timestamp-like ``origin``, ``backward=True`` should be set.
+If you want to make the backward resample from a Timestamp-like ``origin`` , ``backward=True`` should be set.
.. ipython:: python
@@ -1934,6 +1926,12 @@ You can implement ``offset='end_day'`` in the following method equivalently.
end_day_origin
ts.resample("17min", origin=end_day_origin, backward=True).sum()
+By default, backward resample uses ``closed=right``, while ``closed=left`` is also available.
+
+.. ipython:: python
+
+ ts.resample("17min", closed="left", origin="end").sum()
+
.. _timeseries.periods:
Time span representation
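
A minimal sketch of the backward-resample behaviour the restored documentation above describes, assuming a pandas build in which this patch's ``origin='end'`` support is available:

    import numpy as np
    import pandas as pd

    start, end = "2000-10-01 23:30:00", "2000-10-02 00:26:00"
    rng = pd.date_range(start, end, freq="7min")
    ts = pd.Series(np.arange(len(rng)) * 3, index=rng)

    # Bins are anchored at ts.index.max() and laid out backwards; with
    # origin='end' the intervals are right-closed by default.
    ts.resample("17min", origin="end").sum()

Because the bins are laid out backwards, the last bin edge coincides exactly with ``ts.index.max()``.
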
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index d45813960d5c2..ac8132339d38c 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -206,35 +206,47 @@ level-by-level basis.
.. _whatsnew_120.backward_resample:
-Backward resample
+Backward resample
^^^^^^^^^^^^^^^^^
-:class:`Grouper` and :meth:`DataFrame.resample` now support the argument ``backward``. ``'end'`` and ``'end_day'`` are available in argument ``offset``. Backward resample allows users to control bins of the grouping from the given origin with a backward direction. (:issue:`37804`)
+:class:`Grouper` and :meth:`DataFrame.resample` now support the argument ``backward`` . ``'end'`` and ``'end_day'`` are available in argument ``offset`` . Backward resample allows users to control bins of the grouping from the given origin with a backward direction. (:issue:`37804`)
.. ipython:: python
start, end = "2000-10-01 23:30:00", "2000-10-02 00:26:00"
- rng = pd.date_range(start, end, freq="7min")
- ts = pd.Series(np.arange(len(rng)) * 3, index=rng)
+ rng = date_range(start, end, freq="7min")
+ ts = Series(np.arange(len(rng)) * 3, index=rng)
-Setting ``offset='end'`` means using the max ``Timestamp`` as the ``origin`` with ``backward=True``.
+Setting ``offset='end'`` means using the max ``Timestamp`` as the ``origin`` with ``backward=True`` .
ts.index.max()
ts.resample("17min", origin="end").sum()
-Setting ``offset='end_day'`` means using the ceiling midnight of the max ``Timestamp`` as the ``origin`` with ``backward=True``.
+Setting ``offset='end'`` means using the ceiling midnight of the max ``Timestamp`` as the ``origin`` with ``backward=True`` .
.. ipython:: python
- ts.resample("17min", origin="end_day").sum()
+ ts.resample("17min", origin="end").sum()
-If you want to make the backward resample from a Timestamp-like ``origin``, ``backward=True`` should be set.
+If you want to make the backward resample from a Timestamp-like ``origin`` , ``backward=True`` should be set.
.. ipython:: python
ts.resample("17min", origin="2000-10-02 00:40:00", backward=True).sum()
-For details, see: :ref:`timeseries.backward-resample`.
+You can implement ``offset='end_day'`` in the following method equivalently.
+
+.. ipython:: python
+
+ end_day_origin = ts.index.max().ceil("D")
+ end_day_origin
+ ts.resample("17min", origin=end_day_origin, backward=True).sum()
+
+By default, backward resample uses ``closed=right``, while ``closed=left`` is also available.
+
+.. ipython:: python
+
+ ts.resample("17min", closed="left", origin="end").sum()
.. _whatsnew_120.groupby_ewm:
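
A short sketch of the two variants mentioned in the whatsnew entry above, again assuming a build where ``origin='end'`` and ``origin='end_day'`` are available; the ``backward=True`` spelling with a Timestamp-like origin is specific to this PR and is not shown here:

    import numpy as np
    import pandas as pd

    rng = pd.date_range("2000-10-01 23:30:00", "2000-10-02 00:26:00", freq="7min")
    ts = pd.Series(np.arange(len(rng)) * 3, index=rng)

    # Backward-anchored bins grouped with a left-closed interval instead of
    # the right-closed default.
    ts.resample("17min", closed="left", origin="end").sum()

    # Anchor the bins at the ceiling midnight of the last timestamp.
    ts.resample("17min", origin="end_day").sum()

With a fixed anchor such as ``end_day``, the bin edges depend only on the last timestamp of the series and the chosen frequency, not on where the series starts.
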
From b990c5f5043f0a063f457b8a9f03794c6ffa919d Mon Sep 17 00:00:00 2001
From: GYHHAHA <1801214626@qq.com>
Date: Fri, 11 Dec 2020 12:53:25 +0800
Subject: [PATCH 27/42] Revert "Merge branch 'master' into master"
This reverts commit 0cff41ecb37d5663e7eb6debf98e4b6eea9a4a54, reversing
changes made to 77fc4a3fc0acaf1783f7ed87b53da6688b0f6395.
---
.github/workflows/ci.yml | 4 +-
.gitignore | 1 -
.pre-commit-config.yaml | 2 +-
.travis.yml | 7 +-
Dockerfile | 2 +-
Makefile | 2 +-
README.md | 2 +-
asv_bench/benchmarks/algorithms.py | 12 -
asv_bench/benchmarks/categoricals.py | 43 -
asv_bench/benchmarks/groupby.py | 2 +-
asv_bench/benchmarks/hash_functions.py | 164 ---
asv_bench/benchmarks/join_merge.py | 6 -
asv_bench/benchmarks/reshape.py | 5 +-
asv_bench/benchmarks/rolling.py | 13 -
asv_bench/benchmarks/series_methods.py | 73 +-
azure-pipelines.yml | 2 +-
ci/azure/posix.yml | 5 -
ci/azure/windows.yml | 2 +-
ci/build39.sh | 12 +
ci/check_cache.sh | 27 +
ci/code_checks.sh | 2 +-
ci/deps/azure-38-locale.yaml | 2 +-
ci/deps/azure-39.yaml | 22 -
ci/deps/travis-37-locale.yaml | 2 +-
ci/run_tests.sh | 2 +-
ci/setup_env.sh | 13 +-
doc/source/development/contributing.rst | 53 +-
.../development/contributing_docstring.rst | 10 +-
doc/source/development/extending.rst | 2 +-
doc/source/development/index.rst | 1 -
doc/source/development/policies.rst | 2 +-
doc/source/development/test_writing.rst | 174 ---
doc/source/ecosystem.rst | 14 +-
doc/source/getting_started/install.rst | 2 +-
.../intro_tutorials/04_plotting.rst | 4 +-
doc/source/reference/index.rst | 1 +
doc/source/reference/panel.rst | 10 +
doc/source/reference/style.rst | 1 -
doc/source/reference/window.rst | 28 +-
doc/source/user_guide/10min.rst | 4 +-
doc/source/user_guide/basics.rst | 10 +-
doc/source/user_guide/computation.rst | 989 +++++++++++++++++-
doc/source/user_guide/cookbook.rst | 7 +-
doc/source/user_guide/dsintro.rst | 2 +-
doc/source/user_guide/enhancingperf.rst | 8 +-
doc/source/user_guide/groupby.rst | 15 +-
doc/source/user_guide/index.rst | 1 -
doc/source/user_guide/indexing.rst | 78 +-
doc/source/user_guide/integer_na.rst | 2 +-
doc/source/user_guide/io.rst | 9 +-
doc/source/user_guide/merging.rst | 9 +-
doc/source/user_guide/options.rst | 8 +-
doc/source/user_guide/sparse.rst | 2 +-
doc/source/user_guide/style.ipynb | 34 +-
doc/source/user_guide/timeseries.rst | 28 +-
doc/source/user_guide/window.rst | 593 -----------
doc/source/whatsnew/v0.12.0.rst | 6 +-
doc/source/whatsnew/v0.14.0.rst | 4 +-
doc/source/whatsnew/v0.15.0.rst | 6 +-
doc/source/whatsnew/v0.15.2.rst | 2 +-
doc/source/whatsnew/v0.16.1.rst | 4 +-
doc/source/whatsnew/v0.16.2.rst | 2 +-
doc/source/whatsnew/v0.18.0.rst | 4 +-
doc/source/whatsnew/v0.19.0.rst | 2 +-
doc/source/whatsnew/v0.20.0.rst | 12 +-
doc/source/whatsnew/v0.21.0.rst | 2 +-
doc/source/whatsnew/v0.24.0.rst | 4 +-
doc/source/whatsnew/v0.6.0.rst | 2 +-
doc/source/whatsnew/v0.6.1.rst | 4 +-
doc/source/whatsnew/v0.8.0.rst | 2 +-
doc/source/whatsnew/v1.0.0.rst | 4 +-
doc/source/whatsnew/v1.1.5.rst | 24 +-
doc/source/whatsnew/v1.2.0.rst | 427 +++-----
environment.yml | 3 -
pandas/__init__.py | 19 +-
pandas/_libs/groupby.pyx | 30 +-
pandas/_libs/hashtable.pxd | 56 -
pandas/_libs/hashtable.pyx | 44 +-
pandas/_libs/hashtable_class_helper.pxi.in | 98 +-
pandas/_libs/hashtable_func_helper.pxi.in | 18 +-
pandas/_libs/index_class_helper.pxi.in | 30 +-
pandas/_libs/interval.pyx | 3 +-
pandas/_libs/khash.pxd | 83 +-
.../_libs/khash_for_primitive_helper.pxi.in | 42 -
pandas/_libs/lib.pyx | 14 +-
pandas/_libs/reduction.pyx | 4 +-
pandas/_libs/src/klib/khash.h | 181 +---
pandas/_libs/src/klib/khash_python.h | 124 +--
pandas/_libs/src/parser/tokenizer.c | 81 +-
pandas/_libs/tslibs/offsets.pyx | 26 -
pandas/_libs/tslibs/timedeltas.pyx | 9 +-
pandas/_libs/tslibs/tzconversion.pyx | 6 +-
pandas/_libs/window/aggregations.pyx | 31 +-
pandas/_testing.py | 39 +-
pandas/_typing.py | 5 +
pandas/_version.py | 301 ++----
pandas/compat/_optional.py | 2 +-
pandas/conftest.py | 48 +-
pandas/core/algorithms.py | 107 +-
pandas/core/apply.py | 33 +-
pandas/core/arraylike.py | 144 +--
pandas/core/arrays/_mixins.py | 92 +-
pandas/core/arrays/base.py | 47 +-
pandas/core/arrays/categorical.py | 75 +-
pandas/core/arrays/datetimelike.py | 98 +-
pandas/core/arrays/datetimes.py | 12 +-
pandas/core/arrays/floating.py | 2 +-
pandas/core/arrays/integer.py | 2 +-
pandas/core/arrays/interval.py | 305 +++---
pandas/core/arrays/masked.py | 14 +-
pandas/core/arrays/numpy_.py | 2 +-
pandas/core/arrays/period.py | 1 -
pandas/core/arrays/sparse/array.py | 16 +-
pandas/core/arrays/string_.py | 4 +
pandas/core/arrays/string_arrow.py | 625 -----------
pandas/core/arrays/timedeltas.py | 4 +-
pandas/core/base.py | 21 +-
pandas/core/common.py | 45 +-
pandas/core/computation/align.py | 14 +-
pandas/core/computation/parsing.py | 8 +-
pandas/core/computation/pytables.py | 4 -
pandas/core/construction.py | 6 +-
pandas/core/dtypes/base.py | 5 +-
pandas/core/dtypes/cast.py | 10 +-
pandas/core/dtypes/common.py | 2 +-
pandas/core/dtypes/concat.py | 6 +-
pandas/core/dtypes/dtypes.py | 2 +-
pandas/core/dtypes/generic.py | 22 +-
pandas/core/frame.py | 204 ++--
pandas/core/generic.py | 209 ++--
pandas/core/groupby/base.py | 1 -
pandas/core/groupby/generic.py | 18 +-
pandas/core/groupby/groupby.py | 44 +-
pandas/core/groupby/ops.py | 5 +-
pandas/core/indexers.py | 2 +-
pandas/core/indexes/base.py | 142 +--
pandas/core/indexes/category.py | 123 ++-
pandas/core/indexes/datetimelike.py | 230 ++--
pandas/core/indexes/datetimes.py | 52 +-
pandas/core/indexes/extension.py | 64 +-
pandas/core/indexes/interval.py | 164 +--
pandas/core/indexes/multi.py | 272 ++---
pandas/core/indexes/numeric.py | 97 +-
pandas/core/indexes/period.py | 82 +-
pandas/core/indexes/range.py | 24 +-
pandas/core/indexes/timedeltas.py | 31 +-
pandas/core/indexing.py | 196 ++--
pandas/core/internals/blocks.py | 313 +++---
pandas/core/internals/concat.py | 6 +-
pandas/core/internals/construction.py | 5 +-
pandas/core/internals/managers.py | 59 +-
pandas/core/nanops.py | 2 +-
pandas/core/ops/array_ops.py | 8 +-
pandas/core/resample.py | 4 +-
pandas/core/reshape/concat.py | 28 +-
pandas/core/reshape/merge.py | 81 +-
pandas/core/reshape/pivot.py | 3 +-
pandas/core/reshape/reshape.py | 12 +-
pandas/core/series.py | 98 +-
pandas/core/shared_docs.py | 63 --
pandas/core/sorting.py | 23 +-
pandas/core/strings/accessor.py | 3 +-
pandas/core/tools/numeric.py | 6 +-
pandas/core/tools/timedeltas.py | 5 -
pandas/core/window/__init__.py | 5 +-
pandas/core/window/common.py | 4 -
pandas/core/window/ewm.py | 158 +--
pandas/core/window/indexers.py | 15 -
pandas/core/window/numba_.py | 89 --
pandas/core/window/rolling.py | 48 +-
pandas/io/common.py | 201 ++--
pandas/io/excel/_base.py | 150 ++-
pandas/io/excel/_odfreader.py | 4 +-
pandas/io/excel/_odswriter.py | 16 +-
pandas/io/excel/_openpyxl.py | 24 +-
pandas/io/excel/_pyxlsb.py | 2 +-
pandas/io/excel/_xlrd.py | 2 +-
pandas/io/excel/_xlsxwriter.py | 7 +-
pandas/io/excel/_xlwt.py | 17 +-
pandas/io/feather_format.py | 45 +-
pandas/io/formats/console.py | 2 +-
pandas/io/formats/csvs.py | 36 +-
pandas/io/formats/excel.py | 173 ++-
pandas/io/formats/format.py | 74 +-
pandas/io/formats/info.py | 489 ++++-----
pandas/io/formats/printing.py | 2 +-
pandas/io/formats/style.py | 90 +-
pandas/io/json/_json.py | 125 ++-
pandas/io/orc.py | 12 +-
pandas/io/parquet.py | 196 ++--
pandas/io/parsers.py | 252 ++---
pandas/io/pickle.py | 110 +-
pandas/io/pytables.py | 5 +-
pandas/io/sas/sas7bdat.py | 15 +-
pandas/io/sas/sas_xport.py | 20 +-
pandas/io/sas/sasreader.py | 17 +-
pandas/io/sql.py | 2 +-
pandas/io/stata.py | 256 +++--
pandas/plotting/_matplotlib/boxplot.py | 4 +-
pandas/plotting/_matplotlib/converter.py | 2 +-
pandas/plotting/_matplotlib/core.py | 31 +-
pandas/plotting/_matplotlib/tools.py | 10 +-
pandas/tests/arithmetic/conftest.py | 11 +-
pandas/tests/arithmetic/test_datetime64.py | 27 +-
pandas/tests/arithmetic/test_interval.py | 2 +-
pandas/tests/arithmetic/test_numeric.py | 102 +-
pandas/tests/arithmetic/test_period.py | 128 +--
pandas/tests/arithmetic/test_timedelta64.py | 2 +-
.../arrays/categorical/test_analytics.py | 6 +-
pandas/tests/arrays/categorical/test_api.py | 5 +-
.../arrays/categorical/test_constructors.py | 19 +-
.../tests/arrays/categorical/test_dtypes.py | 4 +-
pandas/tests/arrays/categorical/test_take.py | 2 +-
.../arrays/floating/test_construction.py | 2 +-
.../tests/arrays/integer/test_construction.py | 2 +-
pandas/tests/arrays/interval/test_astype.py | 23 -
pandas/tests/arrays/sparse/test_array.py | 2 +-
pandas/tests/arrays/sparse/test_dtype.py | 4 +-
pandas/tests/arrays/string_/test_string.py | 383 ++-----
.../tests/arrays/string_/test_string_arrow.py | 26 -
pandas/tests/arrays/test_datetimelike.py | 104 +-
pandas/tests/arrays/test_period.py | 3 +-
pandas/tests/base/test_conversion.py | 4 +-
pandas/tests/base/test_misc.py | 2 +-
pandas/tests/base/test_value_counts.py | 8 +-
pandas/tests/dtypes/test_generic.py | 1 +
pandas/tests/dtypes/test_inference.py | 19 +-
pandas/tests/extension/test_external_block.py | 2 +-
pandas/tests/extension/test_interval.py | 10 +-
pandas/tests/extension/test_sparse.py | 2 +-
pandas/tests/extension/test_string.py | 58 +-
pandas/tests/frame/apply/test_frame_apply.py | 60 +-
.../tests/frame/apply/test_frame_transform.py | 2 +
pandas/tests/frame/conftest.py | 5 -
pandas/tests/frame/indexing/test_getitem.py | 23 -
pandas/tests/frame/indexing/test_indexing.py | 23 +-
pandas/tests/frame/indexing/test_setitem.py | 34 +-
pandas/tests/frame/indexing/test_xs.py | 30 -
pandas/tests/frame/methods/test_describe.py | 2 +-
pandas/tests/frame/methods/test_dtypes.py | 18 +-
pandas/tests/frame/methods/test_fillna.py | 15 -
pandas/tests/frame/methods/test_reindex.py | 29 +-
pandas/tests/frame/methods/test_replace.py | 48 +-
.../tests/frame/methods/test_reset_index.py | 2 +-
.../tests/frame/methods/test_select_dtypes.py | 26 +-
pandas/tests/frame/methods/test_to_csv.py | 10 +-
pandas/tests/frame/methods/test_to_dict.py | 4 +-
pandas/tests/frame/test_alter_axes.py | 103 +-
pandas/tests/frame/test_constructors.py | 74 +-
pandas/tests/frame/test_logical_ops.py | 36 -
pandas/tests/frame/test_nonunique_indexes.py | 30 +-
pandas/tests/frame/test_reductions.py | 150 +--
pandas/tests/frame/test_repr_info.py | 8 -
pandas/tests/frame/test_stack_unstack.py | 28 +-
pandas/tests/frame/test_ufunc.py | 111 --
pandas/tests/generic/test_duplicate_labels.py | 8 +-
pandas/tests/generic/test_finalize.py | 31 +-
pandas/tests/generic/test_generic.py | 69 +-
pandas/tests/generic/test_logical_ops.py | 49 +
pandas/tests/generic/test_to_xarray.py | 88 +-
.../tests/groupby/aggregate/test_aggregate.py | 23 +-
pandas/tests/groupby/test_allowlist.py | 1 -
pandas/tests/groupby/test_categorical.py | 54 +-
pandas/tests/groupby/test_groupby.py | 33 +-
pandas/tests/groupby/test_missing.py | 10 -
pandas/tests/groupby/test_nth.py | 24 -
pandas/tests/groupby/test_timegrouper.py | 58 +-
.../tests/groupby/transform/test_transform.py | 20 +-
.../tests/indexes/base_class/test_formats.py | 134 ---
.../tests/indexes/base_class/test_setops.py | 110 +-
.../indexes/categorical/test_category.py | 251 +++--
.../tests/indexes/categorical/test_equals.py | 77 --
.../tests/indexes/categorical/test_formats.py | 26 +-
.../indexes/categorical/test_indexing.py | 54 +-
pandas/tests/indexes/categorical/test_map.py | 12 +-
.../tests/indexes/categorical/test_reindex.py | 2 +-
pandas/tests/indexes/common.py | 161 ++-
pandas/tests/indexes/conftest.py | 2 +-
pandas/tests/indexes/datetimelike.py | 35 +-
pandas/tests/indexes/datetimes/test_astype.py | 6 +-
.../indexes/datetimes/test_constructors.py | 24 +-
.../indexes/datetimes/test_date_range.py | 28 +-
.../tests/indexes/datetimes/test_datetime.py | 26 +-
.../tests/indexes/datetimes/test_indexing.py | 58 +-
pandas/tests/indexes/datetimes/test_misc.py | 44 +-
pandas/tests/indexes/datetimes/test_ops.py | 71 +-
.../indexes/datetimes/test_partial_slicing.py | 13 +-
pandas/tests/indexes/datetimes/test_setops.py | 24 +-
pandas/tests/indexes/datetimes/test_shift.py | 4 +-
.../tests/indexes/datetimes/test_timezones.py | 16 +-
pandas/tests/indexes/interval/test_astype.py | 16 +-
pandas/tests/indexes/interval/test_base.py | 54 +-
.../indexes/interval/test_constructors.py | 20 +-
pandas/tests/indexes/interval/test_equals.py | 33 -
.../tests/indexes/interval/test_interval.py | 23 +-
pandas/tests/indexes/interval/test_setops.py | 8 +-
.../tests/indexes/multi/test_constructors.py | 22 +-
pandas/tests/indexes/multi/test_drop.py | 29 -
pandas/tests/indexes/multi/test_indexing.py | 12 +-
pandas/tests/indexes/multi/test_sorting.py | 10 +-
pandas/tests/indexes/numeric/test_indexing.py | 19 +-
pandas/tests/indexes/numeric/test_setops.py | 139 ---
pandas/tests/indexes/period/test_astype.py | 12 +-
pandas/tests/indexes/period/test_indexing.py | 6 +-
pandas/tests/indexes/period/test_ops.py | 40 +-
.../indexes/period/test_partial_slicing.py | 30 +-
.../tests/indexes/ranges/test_constructors.py | 14 +-
pandas/tests/indexes/ranges/test_indexing.py | 2 +-
pandas/tests/indexes/ranges/test_range.py | 25 +
pandas/tests/indexes/ranges/test_setops.py | 25 +-
pandas/tests/indexes/test_any_index.py | 14 -
pandas/tests/indexes/test_base.py | 360 ++++++-
pandas/tests/indexes/test_common.py | 163 ++-
pandas/tests/indexes/test_datetimelike.py | 174 ---
pandas/tests/indexes/test_indexing.py | 54 +-
pandas/tests/indexes/test_numeric.py | 168 ++-
pandas/tests/indexes/test_setops.py | 294 +-----
.../tests/indexes/timedeltas/test_astype.py | 4 +-
.../indexes/timedeltas/test_constructors.py | 4 +-
.../tests/indexes/timedeltas/test_indexing.py | 2 +-
pandas/tests/indexes/timedeltas/test_ops.py | 43 +-
.../indexes/timedeltas/test_scalar_compat.py | 3 +-
.../tests/indexes/timedeltas/test_setops.py | 2 +-
.../indexes/timedeltas/test_timedelta.py | 6 +
pandas/tests/indexing/common.py | 2 +-
.../tests/indexing/interval/test_interval.py | 10 +-
.../indexing/interval/test_interval_new.py | 12 +-
pandas/tests/indexing/multiindex/test_loc.py | 72 --
.../tests/indexing/multiindex/test_partial.py | 48 +-
.../tests/indexing/multiindex/test_setitem.py | 11 +-
.../tests/indexing/multiindex/test_slice.py | 79 +-
pandas/tests/indexing/test_at.py | 29 +-
pandas/tests/indexing/test_categorical.py | 108 +-
.../indexing/test_chaining_and_caching.py | 18 +-
pandas/tests/indexing/test_coercion.py | 4 +-
pandas/tests/indexing/test_datetime.py | 47 +-
pandas/tests/indexing/test_floats.py | 121 +--
pandas/tests/indexing/test_iat.py | 15 +-
pandas/tests/indexing/test_iloc.py | 86 +-
pandas/tests/indexing/test_indexing.py | 143 +--
pandas/tests/indexing/test_loc.py | 143 +--
pandas/tests/indexing/test_partial.py | 10 +-
pandas/tests/indexing/test_scalar.py | 32 +-
pandas/tests/internals/test_internals.py | 31 +-
pandas/tests/io/conftest.py | 2 +-
pandas/tests/io/excel/test_writers.py | 17 +-
pandas/tests/io/excel/test_xlrd.py | 2 +-
.../data/html/various_dtypes_formatted.html | 36 -
pandas/tests/io/formats/test_format.py | 47 +-
pandas/tests/io/formats/test_info.py | 119 ++-
pandas/tests/io/formats/test_style.py | 22 -
pandas/tests/io/formats/test_to_csv.py | 15 +-
pandas/tests/io/formats/test_to_html.py | 15 -
pandas/tests/io/json/test_pandas.py | 14 +-
pandas/tests/io/parser/test_compression.py | 21 +-
pandas/tests/io/parser/test_read_fwf.py | 47 +-
pandas/tests/io/pytables/test_store.py | 20 +-
pandas/tests/io/pytables/test_timezones.py | 30 +-
pandas/tests/io/test_clipboard.py | 2 +-
pandas/tests/io/test_common.py | 19 +-
pandas/tests/io/test_compression.py | 23 +-
pandas/tests/io/test_feather.py | 2 +-
pandas/tests/io/test_fsspec.py | 65 +-
pandas/tests/io/test_gcs.py | 57 +-
pandas/tests/io/test_html.py | 4 +-
pandas/tests/io/test_parquet.py | 94 +-
pandas/tests/io/test_sql.py | 22 +-
pandas/tests/libs/test_hashtable.py | 336 ------
pandas/tests/plotting/frame/test_frame.py | 86 --
pandas/tests/plotting/test_converter.py | 23 +-
pandas/tests/plotting/test_datetimelike.py | 44 +-
pandas/tests/plotting/test_groupby.py | 4 +-
pandas/tests/plotting/test_series.py | 23 -
pandas/tests/reductions/test_reductions.py | 10 +-
pandas/tests/resample/test_datetime_index.py | 31 +-
pandas/tests/resample/test_period_index.py | 10 +-
pandas/tests/resample/test_resample_api.py | 4 +-
pandas/tests/reshape/concat/test_concat.py | 26 +-
pandas/tests/reshape/concat/test_dataframe.py | 11 -
pandas/tests/reshape/concat/test_series.py | 4 +-
pandas/tests/reshape/merge/test_join.py | 14 +-
pandas/tests/reshape/merge/test_merge.py | 20 +-
.../tests/reshape/merge/test_merge_cross.py | 95 --
pandas/tests/reshape/test_get_dummies.py | 2 +-
pandas/tests/reshape/test_pivot.py | 51 +-
pandas/tests/scalar/period/test_period.py | 4 +-
.../tests/scalar/timestamp/test_timestamp.py | 33 +-
.../series/accessors/test_cat_accessor.py | 5 +-
pandas/tests/series/indexing/test_datetime.py | 3 +-
pandas/tests/series/indexing/test_getitem.py | 22 -
pandas/tests/series/indexing/test_indexing.py | 4 +-
pandas/tests/series/indexing/test_setitem.py | 25 -
pandas/tests/series/indexing/test_xs.py | 15 -
pandas/tests/series/methods/test_isin.py | 55 -
pandas/tests/series/methods/test_replace.py | 30 +-
pandas/tests/series/methods/test_shift.py | 2 +-
pandas/tests/series/methods/test_to_csv.py | 10 +-
pandas/tests/series/methods/test_to_frame.py | 4 +-
pandas/tests/series/test_arithmetic.py | 36 +-
pandas/tests/series/test_constructors.py | 55 +-
pandas/tests/series/test_dtypes.py | 67 +-
pandas/tests/series/test_reductions.py | 2 +-
pandas/tests/series/test_validate.py | 2 +-
pandas/tests/test_algos.py | 90 +-
pandas/tests/test_common.py | 8 -
pandas/tests/test_downstream.py | 1 +
pandas/tests/test_multilevel.py | 39 +-
pandas/tests/test_sorting.py | 16 +-
pandas/tests/tools/test_to_datetime.py | 31 -
pandas/tests/tools/test_to_timedelta.py | 17 -
pandas/tests/tslibs/test_array_to_datetime.py | 4 +-
pandas/tests/tslibs/test_parsing.py | 4 +-
pandas/tests/util/test_assert_almost_equal.py | 2 +-
pandas/tests/util/test_hashing.py | 17 +-
pandas/tests/window/common.py | 147 +++
pandas/tests/window/conftest.py | 83 +-
pandas/tests/window/moments/conftest.py | 77 ++
.../moments/test_moments_consistency_ewm.py | 459 ++++----
.../test_moments_consistency_expanding.py | 424 ++++----
.../test_moments_consistency_rolling.py | 550 +++++-----
.../tests/window/moments/test_moments_ewm.py | 12 +-
.../window/moments/test_moments_rolling.py | 5 +-
pandas/tests/window/test_api.py | 73 +-
pandas/tests/window/test_apply.py | 11 +
pandas/tests/window/test_ewm.py | 4 +-
pandas/tests/window/test_expanding.py | 41 +-
.../{test_groupby.py => test_grouper.py} | 303 +++---
pandas/tests/window/test_numba.py | 38 +-
pandas/tests/window/test_rolling.py | 173 +--
pandas/tests/window/test_timeseries_window.py | 19 +-
.../{test_win_type.py => test_window.py} | 57 +-
pandas/tseries/frequencies.py | 27 +-
pandas/util/_doctools.py | 20 +-
release_stats.sh | 51 +
...check_for_inconsistent_pandas_namespace.py | 49 +-
scripts/generate_pip_deps_from_conda.py | 5 +-
scripts/validate_rst_title_capitalization.py | 1 +
setup.cfg | 6 +-
setup.py | 85 +-
test.bat | 3 +
test.sh | 4 +
test_rebuild.sh | 6 +
versioneer.py | 854 ++++++---------
web/pandas/community/ecosystem.md | 2 +-
444 files changed, 9213 insertions(+), 14254 deletions(-)
delete mode 100644 asv_bench/benchmarks/hash_functions.py
create mode 100755 ci/build39.sh
create mode 100755 ci/check_cache.sh
delete mode 100644 ci/deps/azure-39.yaml
delete mode 100644 doc/source/development/test_writing.rst
create mode 100644 doc/source/reference/panel.rst
delete mode 100644 doc/source/user_guide/window.rst
delete mode 100644 pandas/_libs/khash_for_primitive_helper.pxi.in
delete mode 100644 pandas/core/arrays/string_arrow.py
delete mode 100644 pandas/tests/arrays/interval/test_astype.py
delete mode 100644 pandas/tests/arrays/string_/test_string_arrow.py
delete mode 100644 pandas/tests/frame/test_ufunc.py
create mode 100644 pandas/tests/generic/test_logical_ops.py
delete mode 100644 pandas/tests/indexes/base_class/test_formats.py
delete mode 100644 pandas/tests/indexes/categorical/test_equals.py
delete mode 100644 pandas/tests/indexes/interval/test_equals.py
delete mode 100644 pandas/tests/indexes/numeric/test_setops.py
delete mode 100644 pandas/tests/indexes/test_datetimelike.py
delete mode 100644 pandas/tests/io/formats/data/html/various_dtypes_formatted.html
delete mode 100644 pandas/tests/libs/test_hashtable.py
delete mode 100644 pandas/tests/reshape/merge/test_merge_cross.py
create mode 100644 pandas/tests/window/common.py
create mode 100644 pandas/tests/window/moments/conftest.py
rename pandas/tests/window/{test_groupby.py => test_grouper.py} (77%)
rename pandas/tests/window/{test_win_type.py => test_window.py} (57%)
create mode 100755 release_stats.sh
create mode 100644 test.bat
create mode 100755 test.sh
create mode 100755 test_rebuild.sh
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index c00cec450c85e..b391871b18245 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -18,7 +18,7 @@ jobs:
steps:
- name: Setting conda path
- run: echo "${HOME}/miniconda3/bin" >> $GITHUB_PATH
+ run: echo "::add-path::${HOME}/miniconda3/bin"
- name: Checkout
uses: actions/checkout@v1
@@ -98,7 +98,7 @@ jobs:
steps:
- name: Setting conda path
- run: echo "${HOME}/miniconda3/bin" >> $GITHUB_PATH
+ run: echo "::set-env name=PATH::${HOME}/miniconda3/bin:${PATH}"
- name: Checkout
uses: actions/checkout@v1
diff --git a/.gitignore b/.gitignore
index 1661862a5d066..6c3c275c48fb7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,7 +12,6 @@
*.log
*.swp
*.pdb
-*.zip
.project
.pydevproject
.settings
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 717334bfe1299..f9b396715664a 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -26,7 +26,7 @@ repos:
name: isort (cython)
types: [cython]
- repo: https://github.com/asottile/pyupgrade
- rev: v2.7.4
+ rev: v2.7.3
hooks:
- id: pyupgrade
args: [--py37-plus]
diff --git a/.travis.yml b/.travis.yml
index 1ddd886699d38..2bf72bd159fc2 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -35,6 +35,11 @@ matrix:
fast_finish: true
include:
+ - dist: bionic
+ python: 3.9-dev
+ env:
+ - JOB="3.9-dev" PATTERN="(not slow and not network and not clipboard)"
+
- env:
- JOB="3.8, slow" ENV_FILE="ci/deps/travis-38-slow.yaml" PATTERN="slow" SQL="1"
services:
@@ -89,7 +94,7 @@ install:
script:
- echo "script start"
- echo "$JOB"
- - source activate pandas-dev
+ - if [ "$JOB" != "3.9-dev" ]; then source activate pandas-dev; fi
- ci/run_tests.sh
after_script:
diff --git a/Dockerfile b/Dockerfile
index 5d7a2b9e6b743..b8aff5d671dcf 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -43,5 +43,5 @@ RUN conda env update -n base -f "$pandas_home/environment.yml"
# Build C extensions and pandas
RUN cd "$pandas_home" \
- && python setup.py build_ext -j 4 \
+ && python setup.py build_ext --inplace -j 4 \
&& python -m pip install -e .
diff --git a/Makefile b/Makefile
index 2c968234749f5..4f71df51de360 100644
--- a/Makefile
+++ b/Makefile
@@ -9,7 +9,7 @@ clean_pyc:
-find . -name '*.py[co]' -exec rm {} \;
build: clean_pyc
- python setup.py build_ext
+ python setup.py build_ext --inplace
lint-diff:
git diff upstream/master --name-only -- "*.py" | xargs flake8
diff --git a/README.md b/README.md
index 4072faffe3b3a..a2f2f1c04442a 100644
--- a/README.md
+++ b/README.md
@@ -60,7 +60,7 @@ Here are just a few of the things that pandas does well:
and saving/loading data from the ultrafast [**HDF5 format**][hdfstore]
- [**Time series**][timeseries]-specific functionality: date range
generation and frequency conversion, moving window statistics,
- date shifting and lagging
+ date shifting and lagging.
[missing-data]: https://pandas.pydata.org/pandas-docs/stable/missing_data.html#working-with-missing-data
diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py
index 03480ae198345..65e52e03c43c7 100644
--- a/asv_bench/benchmarks/algorithms.py
+++ b/asv_bench/benchmarks/algorithms.py
@@ -5,7 +5,6 @@
from pandas._libs import lib
import pandas as pd
-from pandas.core.algorithms import make_duplicates_of_left_unique_in_right
from .pandas_vb_common import tm
@@ -175,15 +174,4 @@ def time_argsort(self, N):
self.array.argsort()
-class RemoveDuplicates:
- def setup(self):
- N = 10 ** 5
- na = np.arange(int(N / 2))
- self.left = np.concatenate([na[: int(N / 4)], na[: int(N / 4)]])
- self.right = np.concatenate([na, na])
-
- def time_make_duplicates_of_left_unique_in_right(self):
- make_duplicates_of_left_unique_in_right(self.left, self.right)
-
-
from .pandas_vb_common import setup # noqa: F401 isort:skip
diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py
index f3b005b704014..a0b24342091ec 100644
--- a/asv_bench/benchmarks/categoricals.py
+++ b/asv_bench/benchmarks/categoricals.py
@@ -1,5 +1,3 @@
-import string
-import sys
import warnings
import numpy as np
@@ -69,47 +67,6 @@ def time_existing_series(self):
pd.Categorical(self.series)
-class AsType:
- def setup(self):
- N = 10 ** 5
-
- random_pick = np.random.default_rng().choice
-
- categories = {
- "str": list(string.ascii_letters),
- "int": np.random.randint(2 ** 16, size=154),
- "float": sys.maxsize * np.random.random((38,)),
- "timestamp": [
- pd.Timestamp(x, unit="s") for x in np.random.randint(2 ** 18, size=578)
- ],
- }
-
- self.df = pd.DataFrame(
- {col: random_pick(cats, N) for col, cats in categories.items()}
- )
-
- for col in ("int", "float", "timestamp"):
- self.df[col + "_as_str"] = self.df[col].astype(str)
-
- for col in self.df.columns:
- self.df[col] = self.df[col].astype("category")
-
- def astype_str(self):
- [self.df[col].astype("str") for col in "int float timestamp".split()]
-
- def astype_int(self):
- [self.df[col].astype("int") for col in "int_as_str timestamp".split()]
-
- def astype_float(self):
- [
- self.df[col].astype("float")
- for col in "float_as_str int int_as_str timestamp".split()
- ]
-
- def astype_datetime(self):
- self.df["float"].astype(pd.DatetimeTZDtype(tz="US/Pacific"))
-
-
class Concat:
def setup(self):
N = 10 ** 5
diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
index 6ce63ff8badca..22f002e6cb79a 100644
--- a/asv_bench/benchmarks/groupby.py
+++ b/asv_bench/benchmarks/groupby.py
@@ -486,7 +486,7 @@ def setup(self):
tmp2 = (np.random.random(10000) * 10.0).astype(np.float32)
tmp = np.concatenate((tmp1, tmp2))
arr = np.repeat(tmp, 10)
- self.df = DataFrame({"a": arr, "b": arr})
+ self.df = DataFrame(dict(a=arr, b=arr))
def time_sum(self):
self.df.groupby(["a"])["b"].sum()
diff --git a/asv_bench/benchmarks/hash_functions.py b/asv_bench/benchmarks/hash_functions.py
deleted file mode 100644
index 17bf434acf38a..0000000000000
--- a/asv_bench/benchmarks/hash_functions.py
+++ /dev/null
@@ -1,164 +0,0 @@
-import numpy as np
-
-import pandas as pd
-
-
-class IsinAlmostFullWithRandomInt:
- params = [
- [np.float64, np.int64, np.uint64, np.object],
- range(10, 21),
- ]
- param_names = ["dtype", "exponent"]
-
- def setup(self, dtype, exponent):
- M = 3 * 2 ** (exponent - 2)
- # 0.77-the maximal share of occupied buckets
- np.random.seed(42)
- self.s = pd.Series(np.random.randint(0, M, M)).astype(dtype)
- self.values = np.random.randint(0, M, M).astype(dtype)
- self.values_outside = self.values + M
-
- def time_isin(self, dtype, exponent):
- self.s.isin(self.values)
-
- def time_isin_outside(self, dtype, exponent):
- self.s.isin(self.values_outside)
-
-
-class IsinWithRandomFloat:
- params = [
- [np.float64, np.object],
- [
- 1_300,
- 2_000,
- 7_000,
- 8_000,
- 70_000,
- 80_000,
- 750_000,
- 900_000,
- ],
- ]
- param_names = ["dtype", "M"]
-
- def setup(self, dtype, M):
- np.random.seed(42)
- self.values = np.random.rand(M)
- self.s = pd.Series(self.values).astype(dtype)
- np.random.shuffle(self.values)
- self.values_outside = self.values + 0.1
-
- def time_isin(self, dtype, M):
- self.s.isin(self.values)
-
- def time_isin_outside(self, dtype, M):
- self.s.isin(self.values_outside)
-
-
-class IsinWithArangeSorted:
- params = [
- [np.float64, np.int64, np.uint64, np.object],
- [
- 1_000,
- 2_000,
- 8_000,
- 100_000,
- 1_000_000,
- ],
- ]
- param_names = ["dtype", "M"]
-
- def setup(self, dtype, M):
- self.s = pd.Series(np.arange(M)).astype(dtype)
- self.values = np.arange(M).astype(dtype)
-
- def time_isin(self, dtype, M):
- self.s.isin(self.values)
-
-
-class IsinWithArange:
- params = [
- [np.float64, np.int64, np.uint64, np.object],
- [
- 1_000,
- 2_000,
- 8_000,
- ],
- [-2, 0, 2],
- ]
- param_names = ["dtype", "M", "offset_factor"]
-
- def setup(self, dtype, M, offset_factor):
- offset = int(M * offset_factor)
- np.random.seed(42)
- tmp = pd.Series(np.random.randint(offset, M + offset, 10 ** 6))
- self.s = tmp.astype(dtype)
- self.values = np.arange(M).astype(dtype)
-
- def time_isin(self, dtype, M, offset_factor):
- self.s.isin(self.values)
-
-
-class Float64GroupIndex:
- # GH28303
- def setup(self):
- self.df = pd.date_range(
- start="1/1/2018", end="1/2/2018", periods=1e6
- ).to_frame()
- self.group_index = np.round(self.df.index.astype(int) / 1e9)
-
- def time_groupby(self):
- self.df.groupby(self.group_index).last()
-
-
-class UniqueAndFactorizeArange:
- params = range(4, 16)
- param_names = ["exponent"]
-
- def setup(self, exponent):
- a = np.arange(10 ** 4, dtype="float64")
- self.a2 = (a + 10 ** exponent).repeat(100)
-
- def time_factorize(self, exponent):
- pd.factorize(self.a2)
-
- def time_unique(self, exponent):
- pd.unique(self.a2)
-
-
-class NumericSeriesIndexing:
-
- params = [
- (pd.Int64Index, pd.UInt64Index, pd.Float64Index),
- (10 ** 4, 10 ** 5, 5 * 10 ** 5, 10 ** 6, 5 * 10 ** 6),
- ]
- param_names = ["index_dtype", "N"]
-
- def setup(self, index, N):
- vals = np.array(list(range(55)) + [54] + list(range(55, N - 1)))
- indices = index(vals)
- self.data = pd.Series(np.arange(N), index=indices)
-
- def time_loc_slice(self, index, N):
- # trigger building of mapping
- self.data.loc[:800]
-
-
-class NumericSeriesIndexingShuffled:
-
- params = [
- (pd.Int64Index, pd.UInt64Index, pd.Float64Index),
- (10 ** 4, 10 ** 5, 5 * 10 ** 5, 10 ** 6, 5 * 10 ** 6),
- ]
- param_names = ["index_dtype", "N"]
-
- def setup(self, index, N):
- vals = np.array(list(range(55)) + [54] + list(range(55, N - 1)))
- np.random.seed(42)
- np.random.shuffle(vals)
- indices = index(vals)
- self.data = pd.Series(np.arange(N), index=indices)
-
- def time_loc_slice(self, index, N):
- # trigger building of mapping
- self.data.loc[:800]
diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py
index a572b8a70a680..1333b3a0f0560 100644
--- a/asv_bench/benchmarks/join_merge.py
+++ b/asv_bench/benchmarks/join_merge.py
@@ -132,9 +132,6 @@ def time_join_dataframe_index_single_key_small(self, sort):
def time_join_dataframe_index_shuffle_key_bigger_sort(self, sort):
self.df_shuf.join(self.df_key2, on="key2", sort=sort)
- def time_join_dataframes_cross(self, sort):
- self.df.loc[:2000].join(self.df_key1, how="cross", sort=sort)
-
class JoinIndex:
def setup(self):
@@ -208,9 +205,6 @@ def time_merge_dataframe_integer_2key(self, sort):
def time_merge_dataframe_integer_key(self, sort):
merge(self.df, self.df2, on="key1", sort=sort)
- def time_merge_dataframes_cross(self, sort):
- merge(self.left.loc[:2000], self.right.loc[:2000], how="cross", sort=sort)
-
class I8Merge:
diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py
index 9cec8a5f7d318..21081ee23a773 100644
--- a/asv_bench/benchmarks/reshape.py
+++ b/asv_bench/benchmarks/reshape.py
@@ -103,10 +103,7 @@ def setup(self):
nidvars = 20
N = 5000
self.letters = list("ABCD")
- yrvars = [
- letter + str(num)
- for letter, num in product(self.letters, range(1, nyrs + 1))
- ]
+ yrvars = [l + str(num) for l, num in product(self.letters, range(1, nyrs + 1))]
columns = [str(i) for i in range(nidvars)] + yrvars
self.df = DataFrame(np.random.randn(N, nidvars + len(yrvars)), columns=columns)
self.df["id"] = self.df.index
diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py
index 79a33c437ea5c..226b225b47591 100644
--- a/asv_bench/benchmarks/rolling.py
+++ b/asv_bench/benchmarks/rolling.py
@@ -225,17 +225,4 @@ def time_rolling_offset(self, method):
getattr(self.groupby_roll_offset, method)()
-class GroupbyEWM:
-
- params = ["cython", "numba"]
- param_names = ["engine"]
-
- def setup(self, engine):
- df = pd.DataFrame({"A": range(50), "B": range(50)})
- self.gb_ewm = df.groupby("A").ewm(com=1.0)
-
- def time_groupby_mean(self, engine):
- self.gb_ewm.mean(engine=engine)
-
-
from .pandas_vb_common import setup # noqa: F401 isort:skip
diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py
index 2db46abca119c..258c29c145721 100644
--- a/asv_bench/benchmarks/series_methods.py
+++ b/asv_bench/benchmarks/series_methods.py
@@ -2,7 +2,7 @@
import numpy as np
-from pandas import Categorical, NaT, Series, date_range
+from pandas import NaT, Series, date_range
from .pandas_vb_common import tm
@@ -36,28 +36,6 @@ def time_isin(self, dtypes):
self.s.isin(self.values)
-class IsInDatetime64:
- def setup(self):
- dti = date_range(
- start=datetime(2015, 10, 26), end=datetime(2016, 1, 1), freq="50s"
- )
- self.ser = Series(dti)
- self.subset = self.ser._values[::3]
- self.cat_subset = Categorical(self.subset)
-
- def time_isin(self):
- self.ser.isin(self.subset)
-
- def time_isin_cat_values(self):
- self.ser.isin(self.cat_subset)
-
- def time_isin_mismatched_dtype(self):
- self.ser.isin([1, 2])
-
- def time_isin_empty(self):
- self.ser.isin([])
-
-
class IsInFloat64:
def setup(self):
self.small = Series([1, 2], dtype=np.float64)
@@ -112,55 +90,6 @@ def time_isin_long_series_long_values_floats(self):
self.s_long_floats.isin(self.vals_long_floats)
-class IsInLongSeriesLookUpDominates:
- params = [
- ["int64", "int32", "float64", "float32", "object"],
- [5, 1000],
- ["random_hits", "random_misses", "monotone_hits", "monotone_misses"],
- ]
- param_names = ["dtype", "MaxNumber", "series_type"]
-
- def setup(self, dtype, MaxNumber, series_type):
- N = 10 ** 7
- if series_type == "random_hits":
- np.random.seed(42)
- array = np.random.randint(0, MaxNumber, N)
- if series_type == "random_misses":
- np.random.seed(42)
- array = np.random.randint(0, MaxNumber, N) + MaxNumber
- if series_type == "monotone_hits":
- array = np.repeat(np.arange(MaxNumber), N // MaxNumber)
- if series_type == "monotone_misses":
- array = np.arange(N) + MaxNumber
- self.series = Series(array).astype(dtype)
- self.values = np.arange(MaxNumber).astype(dtype)
-
- def time_isin(self, dtypes, MaxNumber, series_type):
- self.series.isin(self.values)
-
-
-class IsInLongSeriesValuesDominate:
- params = [
- ["int64", "int32", "float64", "float32", "object"],
- ["random", "monotone"],
- ]
- param_names = ["dtype", "series_type"]
-
- def setup(self, dtype, series_type):
- N = 10 ** 7
- if series_type == "random":
- np.random.seed(42)
- vals = np.random.randint(0, 10 * N, N)
- if series_type == "monotone":
- vals = np.arange(N)
- self.values = vals.astype(dtype)
- M = 10 ** 6 + 1
- self.series = Series(np.arange(M)).astype(dtype)
-
- def time_isin(self, dtypes, series_type):
- self.series.isin(self.values)
-
-
class NSort:
params = ["first", "last", "all"]
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index c49742095e1d8..b1091ea7f60e4 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -40,7 +40,7 @@ jobs:
. ~/virtualenvs/pandas-dev/bin/activate && \
python -m pip install --no-deps -U pip wheel setuptools && \
pip install cython numpy python-dateutil pytz pytest pytest-xdist hypothesis pytest-azurepipelines && \
- python setup.py build_ext -q -j2 && \
+ python setup.py build_ext -q -i -j2 && \
python -m pip install --no-build-isolation -e . && \
pytest -m 'not slow and not network and not clipboard' pandas --junitxml=test-data.xml"
displayName: 'Run 32-bit manylinux2014 Docker Build / Tests'
diff --git a/ci/azure/posix.yml b/ci/azure/posix.yml
index 8e44db0b4bcd4..3a9bb14470692 100644
--- a/ci/azure/posix.yml
+++ b/ci/azure/posix.yml
@@ -61,11 +61,6 @@ jobs:
PANDAS_TESTING_MODE: "deprecate"
EXTRA_APT: "xsel"
- py39:
- ENV_FILE: ci/deps/azure-39.yaml
- CONDA_PY: "39"
- PATTERN: "not slow and not network and not clipboard"
-
steps:
- script: |
if [ "$(uname)" == "Linux" ]; then
diff --git a/ci/azure/windows.yml b/ci/azure/windows.yml
index e510f4115b25f..601a834d6306a 100644
--- a/ci/azure/windows.yml
+++ b/ci/azure/windows.yml
@@ -34,7 +34,7 @@ jobs:
- bash: |
source activate pandas-dev
conda list
- python setup.py build_ext -q -j 4
+ python setup.py build_ext -q -i -j 4
python -m pip install --no-build-isolation -e .
displayName: 'Build'
diff --git a/ci/build39.sh b/ci/build39.sh
new file mode 100755
index 0000000000000..faef2be03c2bb
--- /dev/null
+++ b/ci/build39.sh
@@ -0,0 +1,12 @@
+#!/bin/bash -e
+# Special build for python3.9 until numpy puts its own wheels up
+
+pip install --no-deps -U pip wheel setuptools
+pip install cython numpy python-dateutil pytz pytest pytest-xdist hypothesis
+
+python setup.py build_ext --inplace
+python -m pip install --no-build-isolation -e .
+
+python -c "import sys; print(sys.version_info)"
+python -c "import pandas as pd"
+python -c "import hypothesis"
diff --git a/ci/check_cache.sh b/ci/check_cache.sh
new file mode 100755
index 0000000000000..b83144fc45ef4
--- /dev/null
+++ b/ci/check_cache.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+# currently not used
+# script to make sure that cache is clean
+# Travis CI now handles this
+
+if [ "$TRAVIS_PULL_REQUEST" == "false" ]
+then
+ echo "Not a PR: checking for changes in ci/ from last 2 commits"
+ git diff HEAD~2 --numstat | grep -E "ci/"
+ ci_changes=$(git diff HEAD~2 --numstat | grep -E "ci/"| wc -l)
+else
+ echo "PR: checking for changes in ci/ from last 2 commits"
+ git fetch origin pull/${TRAVIS_PULL_REQUEST}/head:PR_HEAD
+ git diff PR_HEAD~2 --numstat | grep -E "ci/"
+ ci_changes=$(git diff PR_HEAD~2 --numstat | grep -E "ci/"| wc -l)
+fi
+
+CACHE_DIR="$HOME/.cache/"
+CCACHE_DIR="$HOME/.ccache/"
+
+if [ $ci_changes -ne 0 ]
+then
+ echo "Files have changed in ci/ deleting all caches"
+ rm -rf "$CACHE_DIR"
+ rm -rf "$CCACHE_DIR"
+fi
diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index 3eeee61f62a7e..b5a6e32caa8e0 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -225,7 +225,7 @@ fi
### DOCSTRINGS ###
if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
- MSG='Validate docstrings (GL03, GL04, GL05, GL06, GL07, GL09, GL10, SS01, SS02, SS04, SS05, PR03, PR04, PR05, PR10, EX04, RT01, RT04, RT05, SA02, SA03)' ; echo $MSG
+ MSG='Validate docstrings (GL03, GL04, GL05, GL06, GL07, GL09, GL10, SS02, SS04, SS05, PR03, PR04, PR05, PR10, EX04, RT01, RT04, RT05, SA02, SA03)' ; echo $MSG
$BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=GL03,GL04,GL05,GL06,GL07,GL09,GL10,SS02,SS04,SS05,PR03,PR04,PR05,PR10,EX04,RT01,RT04,RT05,SA02,SA03
RET=$(($RET + $?)) ; echo $MSG "DONE"
diff --git a/ci/deps/azure-38-locale.yaml b/ci/deps/azure-38-locale.yaml
index f879111a32e67..8ce58e07a8542 100644
--- a/ci/deps/azure-38-locale.yaml
+++ b/ci/deps/azure-38-locale.yaml
@@ -34,7 +34,7 @@ dependencies:
- xlsxwriter
- xlwt
- moto
- - pyarrow=1.0.0
+ - pyarrow>=0.15
- pip
- pip:
- pyxlsb
diff --git a/ci/deps/azure-39.yaml b/ci/deps/azure-39.yaml
deleted file mode 100644
index c4c84e73fa684..0000000000000
--- a/ci/deps/azure-39.yaml
+++ /dev/null
@@ -1,22 +0,0 @@
-name: pandas-dev
-channels:
- - conda-forge
-dependencies:
- - python=3.9.*
-
- # tools
- - cython>=0.29.21
- - pytest>=5.0.1
- - pytest-xdist>=1.21
- - hypothesis>=3.58.0
- - pytest-azurepipelines
-
- # pandas dependencies
- - numpy
- - python-dateutil
- - pytz
-
- # optional dependencies
- - pytables
- - scipy
- - pyarrow=1.0
diff --git a/ci/deps/travis-37-locale.yaml b/ci/deps/travis-37-locale.yaml
index 4e442b10482a7..e93a86910bf34 100644
--- a/ci/deps/travis-37-locale.yaml
+++ b/ci/deps/travis-37-locale.yaml
@@ -34,7 +34,7 @@ dependencies:
- pyarrow>=0.17
- pytables>=3.5.1
- scipy
- - xarray=0.12.3
+ - xarray=0.12.0
- xlrd
- xlsxwriter
- xlwt
diff --git a/ci/run_tests.sh b/ci/run_tests.sh
index 78d24c814840a..9b553fbc81a03 100755
--- a/ci/run_tests.sh
+++ b/ci/run_tests.sh
@@ -25,7 +25,7 @@ PYTEST_CMD="${XVFB}pytest -m \"$PATTERN\" -n $PYTEST_WORKERS --dist=loadfile -s
if [[ $(uname) != "Linux" && $(uname) != "Darwin" ]]; then
# GH#37455 windows py38 build appears to be running out of memory
# skip collection of window tests
- PYTEST_CMD="$PYTEST_CMD --ignore=pandas/tests/window/ --ignore=pandas/tests/plotting/"
+ PYTEST_CMD="$PYTEST_CMD --ignore=pandas/tests/window/"
fi
echo $PYTEST_CMD
diff --git a/ci/setup_env.sh b/ci/setup_env.sh
index c36422884f2ec..247f809c5fe63 100755
--- a/ci/setup_env.sh
+++ b/ci/setup_env.sh
@@ -1,5 +1,10 @@
#!/bin/bash -e
+if [ "$JOB" == "3.9-dev" ]; then
+ /bin/bash ci/build39.sh
+ exit 0
+fi
+
# edit the locale file if needed
if [[ "$(uname)" == "Linux" && -n "$LC_ALL" ]]; then
echo "Adding locale to the first line of pandas/__init__.py"
@@ -108,12 +113,6 @@ fi
echo "activate pandas-dev"
source activate pandas-dev
-# Explicitly set an environment variable indicating that this is pandas' CI environment.
-#
-# This allows us to enable things like -Werror that shouldn't be activated in
-# downstream CI jobs that may also build pandas from source.
-export PANDAS_CI=1
-
echo
echo "remove any installed pandas package"
echo "w/o removing anything else"
@@ -137,7 +136,7 @@ conda list pandas
# Make sure any error below is reported as such
echo "[Build extensions]"
-python setup.py build_ext -q -j2
+python setup.py build_ext -q -i -j2
echo "[Updating pip]"
python -m pip install --no-deps -U pip wheel setuptools
diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst
index 3c5a88333be56..4261d79a5e3f5 100644
--- a/doc/source/development/contributing.rst
+++ b/doc/source/development/contributing.rst
@@ -146,7 +146,7 @@ Creating a development environment
----------------------------------
To test out code changes, you'll need to build pandas from source, which
-requires a C/C++ compiler and Python environment. If you're making documentation
+requires a C compiler and Python environment. If you're making documentation
changes, you can skip to :ref:`contributing.documentation` but you won't be able
to build the documentation locally before pushing your changes.
@@ -183,7 +183,7 @@ See https://www.jetbrains.com/help/pycharm/docker.html for details.
Note that you might need to rebuild the C extensions if/when you merge with upstream/master using::
- python setup.py build_ext -j 4
+ python setup.py build_ext --inplace -j 4
.. _contributing.dev_c:
@@ -195,13 +195,6 @@ operations. To install pandas from source, you need to compile these C
extensions, which means you need a C compiler. This process depends on which
platform you're using.
-If you have setup your environment using ``conda``, the packages ``c-compiler``
-and ``cxx-compiler`` will install a fitting compiler for your platform that is
-compatible with the remaining conda packages. On Windows and macOS, you will
-also need to install the SDKs as they have to be distributed separately.
-These packages will be automatically installed by using ``pandas``'s
-``environment.yml``.
-
**Windows**
You will need `Build Tools for Visual Studio 2017
@@ -213,33 +206,12 @@ You will need `Build Tools for Visual Studio 2017
scrolling down to "All downloads" -> "Tools for Visual Studio 2019".
In the installer, select the "C++ build tools" workload.
-You can install the necessary components on the commandline using
-`vs_buildtools.exe `_:
-
-.. code::
-
- vs_buildtools.exe --quiet --wait --norestart --nocache ^
- --installPath C:\BuildTools ^
- --add "Microsoft.VisualStudio.Workload.VCTools;includeRecommended" ^
- --add Microsoft.VisualStudio.Component.VC.v141 ^
- --add Microsoft.VisualStudio.Component.VC.v141.x86.x64 ^
- --add Microsoft.VisualStudio.Component.Windows10SDK.17763
-
-To setup the right paths on the commandline, call
-``"C:\BuildTools\VC\Auxiliary\Build\vcvars64.bat" -vcvars_ver=14.16 10.0.17763.0``.
-
**macOS**
-To use the ``conda``-based compilers, you will need to install the
-Developer Tools using ``xcode-select --install``. Otherwise
-information about compiler installation can be found here:
+Information about compiler installation can be found here:
https://devguide.python.org/setup/#macos
-**Linux**
-
-For Linux-based ``conda`` installations, you won't have to install any
-additional components outside of the conda environment. The instructions
-below are only needed if your setup isn't based on conda environments.
+**Unix**
Some Linux distributions will come with a pre-installed C compiler. To find out
which compilers (and versions) are installed on your system::
@@ -271,10 +243,11 @@ Let us know if you have any difficulties by opening an issue or reaching out on
Creating a Python environment
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Now create an isolated pandas development environment:
+Now that you have a C compiler, create an isolated pandas development
+environment:
-* Install either `Anaconda `_, `miniconda
- `_, or `miniforge `_
+* Install either `Anaconda `_ or `miniconda
+ `_
* Make sure your conda is up to date (``conda update conda``)
* Make sure that you have :ref:`cloned the repository `
* ``cd`` to the pandas source directory
@@ -295,7 +268,7 @@ We'll now kick off a three-step process:
source activate pandas-dev
# Build and install pandas
- python setup.py build_ext -j 4
+ python setup.py build_ext --inplace -j 4
python -m pip install -e . --no-build-isolation --no-use-pep517
At this point you should be able to import pandas from your locally built version::
@@ -342,7 +315,7 @@ You'll need to have at least Python 3.6.1 installed on your system.
python -m pip install -r requirements-dev.txt
# Build and install pandas
- python setup.py build_ext -j 4
+ python setup.py build_ext --inplace -j 4
python -m pip install -e . --no-build-isolation --no-use-pep517
**Unix**/**macOS with pyenv**
@@ -366,7 +339,7 @@ Consult the docs for setting up pyenv `here `__.
python -m pip install -r requirements-dev.txt
# Build and install pandas
- python setup.py build_ext -j 4
+ python setup.py build_ext --inplace -j 4
python -m pip install -e . --no-build-isolation --no-use-pep517
**Windows**
@@ -392,7 +365,7 @@ should already exist.
python -m pip install -r requirements-dev.txt
# Build and install pandas
- python setup.py build_ext -j 4
+ python setup.py build_ext --inplace -j 4
python -m pip install -e . --no-build-isolation --no-use-pep517
Creating a branch
@@ -469,7 +442,7 @@ Some other important things to know about the docs:
contributing_docstring.rst
-* The tutorials make heavy use of the `IPython directive
+* The tutorials make heavy use of the `ipython directive
`_ sphinx extension.
This directive lets you put code in the documentation which will be run
during the doc build. For example::
diff --git a/doc/source/development/contributing_docstring.rst b/doc/source/development/contributing_docstring.rst
index 623d1e8d45565..26cdd0687706c 100644
--- a/doc/source/development/contributing_docstring.rst
+++ b/doc/source/development/contributing_docstring.rst
@@ -63,14 +63,14 @@ The first conventions every Python docstring should follow are defined in
`PEP-257 `_.
As PEP-257 is quite broad, other more specific standards also exist. In the
-case of pandas, the NumPy docstring convention is followed. These conventions are
+case of pandas, the numpy docstring convention is followed. These conventions are
explained in this document:
* `numpydoc docstring guide `_
(which is based in the original `Guide to NumPy/SciPy documentation
`_)
-numpydoc is a Sphinx extension to support the NumPy docstring convention.
+numpydoc is a Sphinx extension to support the numpy docstring convention.
The standard uses reStructuredText (reST). reStructuredText is a markup
language that allows encoding styles in plain text files. Documentation
@@ -401,7 +401,7 @@ DataFrame:
* pandas.Categorical
* pandas.arrays.SparseArray
-If the exact type is not relevant, but must be compatible with a NumPy
+If the exact type is not relevant, but must be compatible with a numpy
array, array-like can be specified. If Any type that can be iterated is
accepted, iterable can be used:
@@ -819,7 +819,7 @@ positional arguments ``head(3)``.
"""
A sample DataFrame method.
- Do not import NumPy and pandas.
+ Do not import numpy and pandas.
Try to use meaningful data, when it makes the example easier
to understand.
@@ -854,7 +854,7 @@ Tips for getting your examples pass the doctests
Getting the examples pass the doctests in the validation script can sometimes
be tricky. Here are some attention points:
-* Import all needed libraries (except for pandas and NumPy, those are already
+* Import all needed libraries (except for pandas and numpy, those are already
imported as ``import pandas as pd`` and ``import numpy as np``) and define
all variables you use in the example.
diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst
index d4219296f5795..77fe930cf21e3 100644
--- a/doc/source/development/extending.rst
+++ b/doc/source/development/extending.rst
@@ -219,7 +219,7 @@ and re-boxes it if necessary.
If applicable, we highly recommend that you implement ``__array_ufunc__`` in your
extension array to avoid coercion to an ndarray. See
-`the NumPy documentation `__
+`the numpy documentation `__
for an example.
As part of your implementation, we require that you defer to pandas when a pandas
diff --git a/doc/source/development/index.rst b/doc/source/development/index.rst
index e842c827b417f..f8a6bb6deb52d 100644
--- a/doc/source/development/index.rst
+++ b/doc/source/development/index.rst
@@ -16,7 +16,6 @@ Development
code_style
maintaining
internals
- test_writing
extending
developer
policies
diff --git a/doc/source/development/policies.rst b/doc/source/development/policies.rst
index f8e6bda2085d8..ced5b686b8246 100644
--- a/doc/source/development/policies.rst
+++ b/doc/source/development/policies.rst
@@ -35,7 +35,7 @@ We will not introduce new deprecations in patch releases.
Deprecations will only be enforced in **major** releases. For example, if a
behavior is deprecated in pandas 1.2.0, it will continue to work, with a
warning, for all releases in the 1.x series. The behavior will change and the
-deprecation removed in the next major release (2.0.0).
+deprecation removed in the next major release (2.0.0).
.. note::
diff --git a/doc/source/development/test_writing.rst b/doc/source/development/test_writing.rst
deleted file mode 100644
index d9e24bb76eed8..0000000000000
--- a/doc/source/development/test_writing.rst
+++ /dev/null
@@ -1,174 +0,0 @@
-.. _test_organization:
-
-Test organization
-=================
-Ideally, there should be one, and only one, obvious place for a test to reside.
-Until we reach that ideal, these are some rules of thumb for where a test should
-be located.
-
-1. Does your test depend only on code in ``pd._libs.tslibs``?
- This test likely belongs in one of:
-
- - tests.tslibs
-
- .. note::
-
- No file in ``tests.tslibs`` should import from any pandas modules
- outside of ``pd._libs.tslibs``
-
- - tests.scalar
- - tests.tseries.offsets
-
-2. Does your test depend only on code in pd._libs?
- This test likely belongs in one of:
-
- - tests.libs
- - tests.groupby.test_libgroupby
-
-3. Is your test for an arithmetic or comparison method?
- This test likely belongs in one of:
-
- - tests.arithmetic
-
- .. note::
-
- These are intended for tests that can be shared to test the behavior
- of DataFrame/Series/Index/ExtensionArray using the ``box_with_array``
- fixture.
-
- - tests.frame.test_arithmetic
- - tests.series.test_arithmetic
-
-4. Is your test for a reduction method (min, max, sum, prod, ...)?
- This test likely belongs in one of:
-
- - tests.reductions
-
- .. note::
-
- These are intended for tests that can be shared to test the behavior
- of DataFrame/Series/Index/ExtensionArray.
-
- - tests.frame.test_reductions
- - tests.series.test_reductions
- - tests.test_nanops
-
-5. Is your test for an indexing method?
- This is the most difficult case for deciding where a test belongs, because
- there are many of these tests, and many of them test more than one method
- (e.g. both ``Series.__getitem__`` and ``Series.loc.__getitem__``)
-
- A) Is the test specifically testing an Index method (e.g. ``Index.get_loc``,
- ``Index.get_indexer``)?
- This test likely belongs in one of:
-
- - tests.indexes.test_indexing
- - tests.indexes.fooindex.test_indexing
-
- Within that files there should be a method-specific test class e.g.
- ``TestGetLoc``.
-
- In most cases, neither ``Series`` nor ``DataFrame`` objects should be
- needed in these tests.
-
- B) Is the test for a Series or DataFrame indexing method *other* than
- ``__getitem__`` or ``__setitem__``, e.g. ``xs``, ``where``, ``take``,
- ``mask``, ``lookup``, or ``insert``?
- This test likely belongs in one of:
-
- - tests.frame.indexing.test_methodname
- - tests.series.indexing.test_methodname
-
- C) Is the test for any of ``loc``, ``iloc``, ``at``, or ``iat``?
- This test likely belongs in one of:
-
- - tests.indexing.test_loc
- - tests.indexing.test_iloc
- - tests.indexing.test_at
- - tests.indexing.test_iat
-
- Within the appropriate file, test classes correspond to either types of
- indexers (e.g. ``TestLocBooleanMask``) or major use cases
- (e.g. ``TestLocSetitemWithExpansion``).
-
- See the note in section D) about tests that test multiple indexing methods.
-
- D) Is the test for ``Series.__getitem__``, ``Series.__setitem__``,
- ``DataFrame.__getitem__``, or ``DataFrame.__setitem__``?
- This test likely belongs in one of:
-
- - tests.series.test_getitem
- - tests.series.test_setitem
- - tests.frame.test_getitem
- - tests.frame.test_setitem
-
- If many cases such a test may test multiple similar methods, e.g.
-
- .. code-block:: python
-
- import pandas as pd
- import pandas._testing as tm
-
- def test_getitem_listlike_of_ints():
- ser = pd.Series(range(5))
-
- result = ser[[3, 4]]
- expected = pd.Series([2, 3])
- tm.assert_series_equal(result, expected)
-
- result = ser.loc[[3, 4]]
- tm.assert_series_equal(result, expected)
-
- In cases like this, the test location should be based on the *underlying*
- method being tested. Or in the case of a test for a bugfix, the location
- of the actual bug. So in this example, we know that ``Series.__getitem__``
- calls ``Series.loc.__getitem__``, so this is *really* a test for
- ``loc.__getitem__``. So this test belongs in ``tests.indexing.test_loc``.
-
-6. Is your test for a DataFrame or Series method?
-
- A) Is the method a plotting method?
- This test likely belongs in one of:
-
- - tests.plotting
-
- B) Is the method an IO method?
- This test likely belongs in one of:
-
- - tests.io
-
- C) Otherwise
- This test likely belongs in one of:
-
- - tests.series.methods.test_mymethod
- - tests.frame.methods.test_mymethod
-
- .. note::
-
- If a test can be shared between DataFrame/Series using the
- ``frame_or_series`` fixture, by convention it goes in the
- ``tests.frame`` file.
-
- - tests.generic.methods.test_mymethod
-
- .. note::
-
- The generic/methods/ directory is only for methods with tests
- that are fully parametrized over Series/DataFrame
-
-7. Is your test for an Index method, not depending on Series/DataFrame?
- This test likely belongs in one of:
-
- - tests.indexes
-
-8) Is your test for one of the pandas-provided ExtensionArrays (``Categorical``,
- ``DatetimeArray``, ``TimedeltaArray``, ``PeriodArray``, ``IntervalArray``,
- ``PandasArray``, ``FloatArray``, ``BoolArray``, ``StringArray``)?
- This test likely belongs in one of:
-
- - tests.arrays
-
-9) Is your test for *all* ExtensionArray subclasses (the "EA Interface")?
- This test likely belongs in one of:
-
- - tests.extension
diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst
index e88875a9f679c..670905f6587bc 100644
--- a/doc/source/ecosystem.rst
+++ b/doc/source/ecosystem.rst
@@ -174,20 +174,10 @@ invoked with the following command
dtale.show(df)
-D-Tale integrates seamlessly with Jupyter notebooks, Python terminals, Kaggle
+D-Tale integrates seamlessly with jupyter notebooks, python terminals, kaggle
& Google Colab. Here are some demos of the `grid `__
and `chart-builder `__.
-`hvplot `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-hvPlot is a high-level plotting API for the PyData ecosystem built on `HoloViews `__.
-It can be loaded as a native pandas plotting backend via
-
-.. code:: python
-
- pd.set_option("plotting.backend", "hvplot")
-
.. _ecosystem.ide:
IDE
@@ -431,7 +421,7 @@ If also displays progress bars.
`Vaex `__
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Increasingly, packages are being built on top of pandas to address specific needs in data preparation, analysis and visualization. Vaex is a Python library for Out-of-Core DataFrames (similar to pandas), to visualize and explore big tabular datasets. It can calculate statistics such as mean, sum, count, standard deviation etc, on an N-dimensional grid up to a billion (10\ :sup:`9`) objects/rows per second. Visualization is done using histograms, density plots and 3d volume rendering, allowing interactive exploration of big data. Vaex uses memory mapping, zero memory copy policy and lazy computations for best performance (no memory wasted).
+Increasingly, packages are being built on top of pandas to address specific needs in data preparation, analysis and visualization. Vaex is a python library for Out-of-Core DataFrames (similar to pandas), to visualize and explore big tabular datasets. It can calculate statistics such as mean, sum, count, standard deviation etc, on an N-dimensional grid up to a billion (10\ :sup:`9`) objects/rows per second. Visualization is done using histograms, density plots and 3d volume rendering, allowing interactive exploration of big data. Vaex uses memory mapping, zero memory copy policy and lazy computations for best performance (no memory wasted).
* vaex.from_pandas
* vaex.to_pandas_df
diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst
index c823ad01f10bf..df481e8c986f7 100644
--- a/doc/source/getting_started/install.rst
+++ b/doc/source/getting_started/install.rst
@@ -284,7 +284,7 @@ pyxlsb 1.0.6 Reading for xlsb files
qtpy Clipboard I/O
s3fs 0.4.0 Amazon S3 access
tabulate 0.8.3 Printing in Markdown-friendly format (see `tabulate`_)
-xarray 0.12.3 pandas-like API for N-dimensional data
+xarray 0.12.0 pandas-like API for N-dimensional data
xclip Clipboard I/O on linux
xlrd 1.2.0 Excel reading
xlwt 1.3.0 Excel writing
diff --git a/doc/source/getting_started/intro_tutorials/04_plotting.rst b/doc/source/getting_started/intro_tutorials/04_plotting.rst
index b7a566a35084d..991c2bbe0fba6 100644
--- a/doc/source/getting_started/intro_tutorials/04_plotting.rst
+++ b/doc/source/getting_started/intro_tutorials/04_plotting.rst
@@ -131,8 +131,8 @@ standard Python to get an overview of the available plot methods:
]
.. note::
- In many development environments as well as IPython and
- Jupyter Notebook, use the TAB button to get an overview of the available
+ In many development environments as well as ipython and
+ jupyter notebook, use the TAB button to get an overview of the available
methods, for example ``air_quality.plot.`` + TAB.
One of the options is :meth:`DataFrame.plot.box`, which refers to a
diff --git a/doc/source/reference/index.rst b/doc/source/reference/index.rst
index f7c5eaf242b34..9d5649c37e92f 100644
--- a/doc/source/reference/index.rst
+++ b/doc/source/reference/index.rst
@@ -30,6 +30,7 @@ public functions related to data types in pandas.
series
frame
arrays
+ panel
indexing
offset_frequency
window
diff --git a/doc/source/reference/panel.rst b/doc/source/reference/panel.rst
new file mode 100644
index 0000000000000..37d48c2dadf2e
--- /dev/null
+++ b/doc/source/reference/panel.rst
@@ -0,0 +1,10 @@
+{{ header }}
+
+.. _api.panel:
+
+=====
+Panel
+=====
+.. currentmodule:: pandas
+
+``Panel`` was removed in 0.25.0. For prior documentation, see the `0.24 documentation `_
diff --git a/doc/source/reference/style.rst b/doc/source/reference/style.rst
index e80dc1b57ff80..24a47336b0522 100644
--- a/doc/source/reference/style.rst
+++ b/doc/source/reference/style.rst
@@ -36,7 +36,6 @@ Style application
Styler.where
Styler.format
Styler.set_precision
- Styler.set_td_classes
Styler.set_table_styles
Styler.set_table_attributes
Styler.set_caption
diff --git a/doc/source/reference/window.rst b/doc/source/reference/window.rst
index a255b3ae8081e..77697b966df18 100644
--- a/doc/source/reference/window.rst
+++ b/doc/source/reference/window.rst
@@ -10,10 +10,8 @@ Rolling objects are returned by ``.rolling`` calls: :func:`pandas.DataFrame.roll
Expanding objects are returned by ``.expanding`` calls: :func:`pandas.DataFrame.expanding`, :func:`pandas.Series.expanding`, etc.
ExponentialMovingWindow objects are returned by ``.ewm`` calls: :func:`pandas.DataFrame.ewm`, :func:`pandas.Series.ewm`, etc.
-.. _api.functions_rolling:
-
-Rolling window functions
-------------------------
+Standard moving window functions
+--------------------------------
.. currentmodule:: pandas.core.window.rolling
.. autosummary::
@@ -35,16 +33,6 @@ Rolling window functions
Rolling.aggregate
Rolling.quantile
Rolling.sem
-
-.. _api.functions_window:
-
-Weighted window functions
--------------------------
-.. currentmodule:: pandas.core.window.rolling
-
-.. autosummary::
- :toctree: api/
-
Window.mean
Window.sum
Window.var
@@ -52,8 +40,8 @@ Weighted window functions
.. _api.functions_expanding:
-Expanding window functions
---------------------------
+Standard expanding window functions
+-----------------------------------
.. currentmodule:: pandas.core.window.expanding
.. autosummary::
@@ -76,10 +64,8 @@ Expanding window functions
Expanding.quantile
Expanding.sem
-.. _api.functions_ewm:
-
-Exponentially-weighted window functions
----------------------------------------
+Exponentially-weighted moving window functions
+----------------------------------------------
.. currentmodule:: pandas.core.window.ewm
.. autosummary::
@@ -91,8 +77,6 @@ Exponentially-weighted window functions
ExponentialMovingWindow.corr
ExponentialMovingWindow.cov
-.. _api.indexers_window:
-
Window indexer
--------------
.. currentmodule:: pandas
diff --git a/doc/source/user_guide/10min.rst b/doc/source/user_guide/10min.rst
index cf548ba5d1133..08f83a4674ada 100644
--- a/doc/source/user_guide/10min.rst
+++ b/doc/source/user_guide/10min.rst
@@ -239,13 +239,13 @@ Select via the position of the passed integers:
df.iloc[3]
-By integer slices, acting similar to numpy/Python:
+By integer slices, acting similar to numpy/python:
.. ipython:: python
df.iloc[3:5, 0:2]
-By lists of integer position locations, similar to the NumPy/Python style:
+By lists of integer position locations, similar to the numpy/python style:
.. ipython:: python
diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst
index ffecaa222e1f9..53fabf94e24e0 100644
--- a/doc/source/user_guide/basics.rst
+++ b/doc/source/user_guide/basics.rst
@@ -538,8 +538,8 @@ standard deviation of 1), very concisely:
Note that methods like :meth:`~DataFrame.cumsum` and :meth:`~DataFrame.cumprod`
preserve the location of ``NaN`` values. This is somewhat different from
-:meth:`~DataFrame.expanding` and :meth:`~DataFrame.rolling` since ``NaN`` behavior
-is furthermore dictated by a ``min_periods`` parameter.
+:meth:`~DataFrame.expanding` and :meth:`~DataFrame.rolling`.
+For more details please see :ref:`this note `.
.. ipython:: python
@@ -845,7 +845,7 @@ For example, we can fit a regression using statsmodels. Their API expects a form
The pipe method is inspired by unix pipes and more recently dplyr_ and magrittr_, which
have introduced the popular ``(%>%)`` (read pipe) operator for R_.
-The implementation of ``pipe`` here is quite clean and feels right at home in Python.
+The implementation of ``pipe`` here is quite clean and feels right at home in python.
We encourage you to view the source code of :meth:`~DataFrame.pipe`.
.. _dplyr: https://github.com/hadley/dplyr
@@ -945,7 +945,7 @@ Aggregation API
The aggregation API allows one to express possibly multiple aggregation operations in a single concise way.
This API is similar across pandas objects, see :ref:`groupby API `, the
-:ref:`window API `, and the :ref:`resample API `.
+:ref:`window functions API `, and the :ref:`resample API `.
The entry point for aggregation is :meth:`DataFrame.aggregate`, or the alias
:meth:`DataFrame.agg`.
@@ -2203,7 +2203,7 @@ You can use the :meth:`~DataFrame.astype` method to explicitly convert dtypes fr
even if the dtype was unchanged (pass ``copy=False`` to change this behavior). In addition, they will raise an
exception if the astype operation is invalid.
-Upcasting is always according to the **NumPy** rules. If two different dtypes are involved in an operation,
+Upcasting is always according to the **numpy** rules. If two different dtypes are involved in an operation,
then the more *general* one will be used as the result of the operation.
.. ipython:: python
diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst
index 17d1809638d61..45d15f29fcce8 100644
--- a/doc/source/user_guide/computation.rst
+++ b/doc/source/user_guide/computation.rst
@@ -206,9 +206,990 @@ parameter:
- ``max`` : highest rank in the group
- ``first`` : ranks assigned in the order they appear in the array
-.. _computation.windowing:
+.. _stats.moments:
-Windowing functions
-~~~~~~~~~~~~~~~~~~~
+Window functions
+----------------
-See :ref:`the window operations user guide ` for an overview of windowing functions.
+.. currentmodule:: pandas.core.window
+
+For working with data, a number of window functions are provided for
+computing common *window* or *rolling* statistics. Among these are count, sum,
+mean, median, correlation, variance, covariance, standard deviation, skewness,
+and kurtosis.
+
+The ``rolling()`` and ``expanding()``
+functions can be used directly from DataFrameGroupBy objects,
+see the :ref:`groupby docs `.
+
+
+.. note::
+
+ The API for window statistics is quite similar to the way one works with ``GroupBy`` objects, see the documentation :ref:`here `.
+
+.. warning::
+
+ When using ``rolling()`` with an associated function, the results are calculated with rolling sums. As a consequence,
+ when values differ in magnitude by :math:`1/np.finfo(np.double).eps`, this results in truncation. Note that
+ large values may have an impact on windows which do not include these values. `Kahan summation
+ `__ is used
+ to compute the rolling sums to preserve accuracy as much as possible. The same holds true for ``Rolling.var()`` when
+ values differ in magnitude by :math:`(1/np.finfo(np.double).eps)^{0.5}`.
+
+We work with ``rolling``, ``expanding`` and ``exponentially weighted`` data through the corresponding
+objects, :class:`~pandas.core.window.Rolling`, :class:`~pandas.core.window.Expanding` and :class:`~pandas.core.window.ExponentialMovingWindow`.
+
+.. ipython:: python
+
+ s = pd.Series(np.random.randn(1000), index=pd.date_range("1/1/2000", periods=1000))
+ s = s.cumsum()
+ s
+
+These are created from methods on ``Series`` and ``DataFrame``.
+
+.. ipython:: python
+
+ r = s.rolling(window=60)
+ r
+
+These objects provide tab-completion of the available methods and properties.
+
+.. code-block:: ipython
+
+ In [14]: r. # noqa: E225, E999
+ r.agg r.apply r.count r.exclusions r.max r.median r.name r.skew r.sum
+ r.aggregate r.corr r.cov r.kurt r.mean r.min r.quantile r.std r.var
+
+Generally these methods all have the same interface. They all
+accept the following arguments:
+
+- ``window``: size of moving window
+- ``min_periods``: threshold of non-null data points to require (otherwise
+ result is NA)
+- ``center``: boolean, whether to set the labels at the center (default is False)
+
+We can then call methods on these ``rolling`` objects. These return like-indexed objects:
+
+.. ipython:: python
+
+ r.mean()
+
+.. ipython:: python
+
+ s.plot(style="k--")
+
+ @savefig rolling_mean_ex.png
+ r.mean().plot(style="k")
+
+.. ipython:: python
+ :suppress:
+
+ plt.close("all")
+
+They can also be applied to DataFrame objects. This is really just syntactic
+sugar for applying the moving window operator to all of the DataFrame's columns:
+
+.. ipython:: python
+
+ df = pd.DataFrame(
+ np.random.randn(1000, 4),
+ index=pd.date_range("1/1/2000", periods=1000),
+ columns=["A", "B", "C", "D"],
+ )
+ df = df.cumsum()
+
+ @savefig rolling_mean_frame.png
+ df.rolling(window=60).sum().plot(subplots=True)
+
+.. _stats.summary:
+
+Method summary
+~~~~~~~~~~~~~~
+
+We provide a number of common statistical functions:
+
+.. currentmodule:: pandas.core.window
+
+.. csv-table::
+ :header: "Method", "Description"
+ :widths: 20, 80
+
+ :meth:`~Rolling.count`, Number of non-null observations
+ :meth:`~Rolling.sum`, Sum of values
+ :meth:`~Rolling.mean`, Mean of values
+ :meth:`~Rolling.median`, Arithmetic median of values
+ :meth:`~Rolling.min`, Minimum
+ :meth:`~Rolling.max`, Maximum
+ :meth:`~Rolling.std`, Sample standard deviation
+ :meth:`~Rolling.var`, Sample variance
+ :meth:`~Rolling.skew`, Sample skewness (3rd moment)
+ :meth:`~Rolling.kurt`, Sample kurtosis (4th moment)
+ :meth:`~Rolling.quantile`, Sample quantile (value at %)
+ :meth:`~Rolling.apply`, Generic apply
+ :meth:`~Rolling.cov`, Sample covariance (binary)
+ :meth:`~Rolling.corr`, Sample correlation (binary)
+ :meth:`~Rolling.sem`, Standard error of mean
+
+.. _computation.window_variance.caveats:
+
+.. note::
+
+ Please note that :meth:`~Rolling.std` and :meth:`~Rolling.var` use the sample
+ variance formula by default, i.e. the sum of squared differences is divided by
+ ``window_size - 1`` and not by ``window_size`` during averaging. In statistics,
+ we use sample when the dataset is drawn from a larger population that we
+ don't have access to. Using it implies that the data in our window is a
+ random sample from the population, and we are interested not in the variance
+ inside the specific window but in the variance of some general window that
+ our windows represent. In this situation, using the sample variance formula
+ results in an unbiased estimator and so is preferred.
+
+ Usually, we are instead interested in the variance of each window as we slide
+ it over the data, and in this case we should specify ``ddof=0`` when calling
+ these methods to use population variance instead of sample variance. Using
+ sample variance under the circumstances would result in a biased estimator
+ of the variable we are trying to determine.
+
+ The same caveats apply to using any supported statistical sample methods.
+
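+For illustration, the difference between the sample and population variance can
+be seen directly (a minimal sketch reusing the ``s`` series defined above):
+
+.. ipython:: python
+
+ # ddof=1 (sample variance, the default) vs. ddof=0 (population variance)
+ s.rolling(window=5).var().head()
+ s.rolling(window=5).var(ddof=0).head()
+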
+.. _stats.rolling_apply:
+
+Rolling apply
+~~~~~~~~~~~~~
+
+The :meth:`~Rolling.apply` function takes an extra ``func`` argument and performs
+generic rolling computations. The ``func`` argument should be a single function
+that produces a single value from an ndarray input. Suppose we wanted to
+compute the mean absolute deviation on a rolling basis:
+
+.. ipython:: python
+
+ def mad(x):
+ return np.fabs(x - x.mean()).mean()
+
+ @savefig rolling_apply_ex.png
+ s.rolling(window=60).apply(mad, raw=True).plot(style="k")
+
+Using the Numba engine
+~~~~~~~~~~~~~~~~~~~~~~
+
+.. versionadded:: 1.0
+
+Additionally, :meth:`~Rolling.apply` can leverage `Numba `__
+if installed as an optional dependency. The apply aggregation can be executed using Numba by specifying
+``engine='numba'`` and ``engine_kwargs`` arguments (``raw`` must also be set to ``True``).
+Numba will be applied in potentially two routines:
+
+1. If ``func`` is a standard Python function, the engine will `JIT `__
+the passed function. ``func`` can also be a JITed function in which case the engine will not JIT the function again.
+
+2. The engine will JIT the for loop where the apply function is applied to each window.
+
+The ``engine_kwargs`` argument is a dictionary of keyword arguments that will be passed into the
+`numba.jit decorator `__.
+These keyword arguments will be applied to *both* the passed function (if a standard Python function)
+and the apply for loop over each window. Currently only ``nogil``, ``nopython``, and ``parallel`` are supported,
+and their default values are set to ``False``, ``True`` and ``False`` respectively.
+
+.. note::
+
+ In terms of performance, **the first time a function is run using the Numba engine will be slow**
+ as Numba will have some function compilation overhead. However, the compiled functions are cached,
+ and subsequent calls will be fast. In general, the Numba engine is performant with
+ a larger amount of data points (e.g. 1+ million).
+
+.. code-block:: ipython
+
+ In [1]: data = pd.Series(range(1_000_000))
+
+ In [2]: roll = data.rolling(10)
+
+ In [3]: def f(x):
+ ...: return np.sum(x) + 5
+ # Run the first time, compilation time will affect performance
+ In [4]: %timeit -r 1 -n 1 roll.apply(f, engine='numba', raw=True) # noqa: E225
+ 1.23 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
+ # Function is cached and performance will improve
+ In [5]: %timeit roll.apply(f, engine='numba', raw=True)
+ 188 ms ± 1.93 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
+
+ In [6]: %timeit roll.apply(f, engine='cython', raw=True)
+ 3.92 s ± 59 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
+
+.. _stats.rolling_window:
+
+Rolling windows
+~~~~~~~~~~~~~~~
+
+Passing ``win_type`` to ``.rolling`` generates a generic rolling window computation that is weighted according to the ``win_type``.
+The following methods are available:
+
+.. csv-table::
+ :header: "Method", "Description"
+ :widths: 20, 80
+
+ :meth:`~Window.sum`, Sum of values
+ :meth:`~Window.mean`, Mean of values
+
+The weights used in the window are specified by the ``win_type`` keyword.
+The list of recognized types are the `scipy.signal window functions
+`__:
+
+* ``boxcar``
+* ``triang``
+* ``blackman``
+* ``hamming``
+* ``bartlett``
+* ``parzen``
+* ``bohman``
+* ``blackmanharris``
+* ``nuttall``
+* ``barthann``
+* ``kaiser`` (needs beta)
+* ``gaussian`` (needs std)
+* ``general_gaussian`` (needs power, width)
+* ``slepian`` (needs width)
+* ``exponential`` (needs tau).
+
+.. versionadded:: 1.2.0
+
+All Scipy window types available in your installed version are recognized as valid ``win_types``.
+
+.. ipython:: python
+
+ ser = pd.Series(np.random.randn(10), index=pd.date_range("1/1/2000", periods=10))
+
+ ser.rolling(window=5, win_type="triang").mean()
+
+Note that the ``boxcar`` window is equivalent to :meth:`~Rolling.mean`.
+
+.. ipython:: python
+
+ ser.rolling(window=5, win_type="boxcar").mean()
+ ser.rolling(window=5).mean()
+
+For some windowing functions, additional parameters must be specified:
+
+.. ipython:: python
+
+ ser.rolling(window=5, win_type="gaussian").mean(std=0.1)
+
+.. _stats.moments.normalization:
+
+.. note::
+
+ For ``.sum()`` with a ``win_type``, there is no normalization done to the
+ weights for the window. Passing custom weights of ``[1, 1, 1]`` will yield a different
+ result than passing weights of ``[2, 2, 2]``, for example. When passing a
+ ``win_type`` instead of explicitly specifying the weights, the weights are
+ already normalized so that the largest weight is 1.
+
+ In contrast, the nature of the ``.mean()`` calculation is
+ such that the weights are normalized with respect to each other. Weights
+ of ``[1, 1, 1]`` and ``[2, 2, 2]`` yield the same result.
+
+.. _stats.moments.ts:
+
+Time-aware rolling
+~~~~~~~~~~~~~~~~~~
+
+It is possible to pass an offset (or convertible) to a ``.rolling()`` method and have it produce
+variable sized windows based on the passed time window. For each time point, this includes all preceding values occurring
+within the indicated time delta.
+
+This can be particularly useful for a non-regular time frequency index.
+
+.. ipython:: python
+
+ dft = pd.DataFrame(
+ {"B": [0, 1, 2, np.nan, 4]},
+ index=pd.date_range("20130101 09:00:00", periods=5, freq="s"),
+ )
+ dft
+
+This is a regular frequency index. Using an integer window parameter works to roll along the window frequency.
+
+.. ipython:: python
+
+ dft.rolling(2).sum()
+ dft.rolling(2, min_periods=1).sum()
+
+Specifying an offset allows a more intuitive specification of the rolling frequency.
+
+.. ipython:: python
+
+ dft.rolling("2s").sum()
+
+Using a non-regular, but still monotonic index, rolling with an integer window does not impart any special calculation.
+
+
+.. ipython:: python
+
+ dft = pd.DataFrame(
+ {"B": [0, 1, 2, np.nan, 4]},
+ index=pd.Index(
+ [
+ pd.Timestamp("20130101 09:00:00"),
+ pd.Timestamp("20130101 09:00:02"),
+ pd.Timestamp("20130101 09:00:03"),
+ pd.Timestamp("20130101 09:00:05"),
+ pd.Timestamp("20130101 09:00:06"),
+ ],
+ name="foo",
+ ),
+ )
+ dft
+ dft.rolling(2).sum()
+
+
+Using the time-specification generates variable windows for this sparse data.
+
+.. ipython:: python
+
+ dft.rolling("2s").sum()
+
+Furthermore, we now allow an optional ``on`` parameter to specify a column (rather than the
+default of the index) in a DataFrame.
+
+.. ipython:: python
+
+ dft = dft.reset_index()
+ dft
+ dft.rolling("2s", on="foo").sum()
+
+.. _stats.custom_rolling_window:
+
+Custom window rolling
+~~~~~~~~~~~~~~~~~~~~~
+
+.. versionadded:: 1.0
+
+In addition to accepting an integer or offset as a ``window`` argument, ``rolling`` also accepts
+a ``BaseIndexer`` subclass that allows a user to define a custom method for calculating window bounds.
+The ``BaseIndexer`` subclass will need to define a ``get_window_bounds`` method that returns
+a tuple of two arrays, the first being the starting indices of the windows and second being the
+ending indices of the windows. Additionally, ``num_values``, ``min_periods``, ``center``, ``closed``
+and will automatically be passed to ``get_window_bounds`` and the defined method must
+always accept these arguments.
+
+For example, if we have the following ``DataFrame``:
+
+.. ipython:: python
+
+ use_expanding = [True, False, True, False, True]
+ use_expanding
+ df = pd.DataFrame({"values": range(5)})
+ df
+
+and we want to use an expanding window where ``use_expanding`` is ``True``, and otherwise a window of size
+1, we can create the following ``BaseIndexer`` subclass:
+
+.. code-block:: ipython
+
+ In [2]: from pandas.api.indexers import BaseIndexer
+ ...:
+ ...: class CustomIndexer(BaseIndexer):
+ ...:
+ ...: def get_window_bounds(self, num_values, min_periods, center, closed):
+ ...: start = np.empty(num_values, dtype=np.int64)
+ ...: end = np.empty(num_values, dtype=np.int64)
+ ...: for i in range(num_values):
+ ...: if self.use_expanding[i]:
+ ...: start[i] = 0
+ ...: end[i] = i + 1
+ ...: else:
+ ...: start[i] = i
+ ...: end[i] = i + self.window_size
+ ...: return start, end
+ ...:
+
+ In [3]: indexer = CustomIndexer(window_size=1, use_expanding=use_expanding)
+
+ In [4]: df.rolling(indexer).sum()
+ Out[4]:
+ values
+ 0 0.0
+ 1 1.0
+ 2 3.0
+ 3 3.0
+ 4 10.0
+
+You can view other examples of ``BaseIndexer`` subclasses `here `__
+
+.. versionadded:: 1.1
+
+One subclass of note within those examples is the ``VariableOffsetWindowIndexer`` that allows
+rolling operations over a non-fixed offset like a ``BusinessDay``.
+
+.. ipython:: python
+
+ from pandas.api.indexers import VariableOffsetWindowIndexer
+
+ df = pd.DataFrame(range(10), index=pd.date_range("2020", periods=10))
+ offset = pd.offsets.BDay(1)
+ indexer = VariableOffsetWindowIndexer(index=df.index, offset=offset)
+ df
+ df.rolling(indexer).sum()
+
+For some problems knowledge of the future is available for analysis. For example, this occurs when
+each data point is a full time series read from an experiment, and the task is to extract underlying
+conditions. In these cases it can be useful to perform forward-looking rolling window computations.
+:func:`FixedForwardWindowIndexer ` class is available for this purpose.
+This :func:`BaseIndexer ` subclass implements a closed fixed-width
+forward-looking rolling window, and we can use it as follows:
+
+.. ipython:: python
+
+ from pandas.api.indexers import FixedForwardWindowIndexer
+ indexer = FixedForwardWindowIndexer(window_size=2)
+ df.rolling(indexer, min_periods=1).sum()
+
+.. _stats.rolling_window.endpoints:
+
+Rolling window endpoints
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+The inclusion of the interval endpoints in rolling window calculations can be specified with the ``closed``
+parameter:
+
+.. csv-table::
+ :header: "``closed``", "Description", "Default for"
+ :widths: 20, 30, 30
+
+ ``right``, close right endpoint,
+ ``left``, close left endpoint,
+ ``both``, close both endpoints,
+ ``neither``, open endpoints,
+
+For example, having the right endpoint open is useful in many problems that require that there is no contamination
+from present information back to past information. This allows the rolling window to compute statistics
+"up to that point in time", but not including that point in time.
+
+.. ipython:: python
+
+ df = pd.DataFrame(
+ {"x": 1},
+ index=[
+ pd.Timestamp("20130101 09:00:01"),
+ pd.Timestamp("20130101 09:00:02"),
+ pd.Timestamp("20130101 09:00:03"),
+ pd.Timestamp("20130101 09:00:04"),
+ pd.Timestamp("20130101 09:00:06"),
+ ],
+ )
+
+ df["right"] = df.rolling("2s", closed="right").x.sum() # default
+ df["both"] = df.rolling("2s", closed="both").x.sum()
+ df["left"] = df.rolling("2s", closed="left").x.sum()
+ df["neither"] = df.rolling("2s", closed="neither").x.sum()
+
+ df
+
+.. _stats.iter_rolling_window:
+
+Iteration over windows
+~~~~~~~~~~~~~~~~~~~~~~
+
+.. versionadded:: 1.1.0
+
+``Rolling`` and ``Expanding`` objects now support iteration. Note that ``min_periods`` is ignored in iteration.
+
+.. ipython::
+
+ In [1]: df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
+
+ In [2]: for i in df.rolling(2):
+ ...: print(i)
+ ...:
+
+
+.. _stats.moments.ts-versus-resampling:
+
+Time-aware rolling vs. resampling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Using ``.rolling()`` with a time-based index is quite similar to :ref:`resampling `. They
+both operate and perform reductive operations on time-indexed pandas objects.
+
+When using ``.rolling()`` with an offset, the offset is a time-delta. Take a backwards-in-time looking window, and
+aggregate all of the values in that window (including the end-point, but not the start-point). This is the new value
+at that point in the result. These are variable sized windows in time-space for each point of the input. You will get
+a same sized result as the input.
+
+When using ``.resample()`` with an offset, construct a new index that is the frequency of the offset. For each frequency
+bin, aggregate points from the input within a backwards-in-time looking window that fall in that bin. The result of this
+aggregation is the output for that frequency point. The windows are fixed size in the frequency space. Your result
+will have the shape of a regular frequency between the min and the max of the original input object.
+
+To summarize, ``.rolling()`` is a time-based window operation, while ``.resample()`` is a frequency-based window operation.
+
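+As a small illustrative sketch (the ``ts`` series below is constructed only for
+this comparison), ``.rolling()`` returns one value per input point, while
+``.resample()`` returns one value per frequency bin:
+
+.. ipython:: python
+
+ ts = pd.Series(range(5), index=pd.date_range("2020-01-01", periods=5, freq="D"))
+ # rolling: a backwards-looking time-delta window, one output per input point
+ ts.rolling("2D").sum()
+ # resample: one aggregated value per 2-day bin
+ ts.resample("2D").sum()
+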
+Centering windows
+~~~~~~~~~~~~~~~~~
+
+By default the labels are set to the right edge of the window, but a
+``center`` keyword is available so the labels can be set at the center.
+
+.. ipython:: python
+
+ ser.rolling(window=5).mean()
+ ser.rolling(window=5, center=True).mean()
+
+.. _stats.moments.binary:
+
+Binary window functions
+~~~~~~~~~~~~~~~~~~~~~~~
+
+:meth:`~Rolling.cov` and :meth:`~Rolling.corr` can compute moving window statistics about
+two ``Series`` or any combination of ``DataFrame/Series`` or
+``DataFrame/DataFrame``. Here is the behavior in each case:
+
+* two ``Series``: compute the statistic for the pairing.
+* ``DataFrame/Series``: compute the statistics for each column of the DataFrame
+ with the passed Series, thus returning a DataFrame.
+* ``DataFrame/DataFrame``: by default compute the statistic for matching column
+ names, returning a DataFrame. If the keyword argument ``pairwise=True`` is
+ passed then computes the statistic for each pair of columns, returning a
+ ``MultiIndexed DataFrame`` whose ``index`` are the dates in question (see :ref:`the next section
+ `).
+
+For example:
+
+.. ipython:: python
+
+ df = pd.DataFrame(
+ np.random.randn(1000, 4),
+ index=pd.date_range("1/1/2000", periods=1000),
+ columns=["A", "B", "C", "D"],
+ )
+ df = df.cumsum()
+
+ df2 = df[:20]
+ df2.rolling(window=5).corr(df2["B"])
+
+.. _stats.moments.corr_pairwise:
+
+Computing rolling pairwise covariances and correlations
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In financial data analysis and other fields it's common to compute covariance
+and correlation matrices for a collection of time series. Often one is also
+interested in moving-window covariance and correlation matrices. This can be
+done by passing the ``pairwise`` keyword argument, which in the case of
+``DataFrame`` inputs will yield a MultiIndexed ``DataFrame`` whose ``index`` are the dates in
+question. In the case of a single DataFrame argument the ``pairwise`` argument
+can even be omitted:
+
+.. note::
+
+ Missing values are ignored and each entry is computed using the pairwise
+ complete observations. Please see the :ref:`covariance section
+ ` for :ref:`caveats
+ ` associated with this method of
+ calculating covariance and correlation matrices.
+
+.. ipython:: python
+
+ covs = (
+ df[["B", "C", "D"]]
+ .rolling(window=50)
+ .cov(df[["A", "B", "C"]], pairwise=True)
+ )
+ covs.loc["2002-09-22":]
+
+.. ipython:: python
+
+ correls = df.rolling(window=50).corr()
+ correls.loc["2002-09-22":]
+
+You can efficiently retrieve the time series of correlations between two
+columns by reshaping and indexing:
+
+.. ipython:: python
+ :suppress:
+
+ plt.close("all")
+
+.. ipython:: python
+
+ @savefig rolling_corr_pairwise_ex.png
+ correls.unstack(1)[("A", "C")].plot()
+
+.. _stats.aggregate:
+
+Aggregation
+-----------
+
+Once the ``Rolling``, ``Expanding`` or ``ExponentialMovingWindow`` objects have been created, several methods are available to
+perform multiple computations on the data. These operations are similar to the :ref:`aggregating API `,
+:ref:`groupby API `, and :ref:`resample API `.
+
+
+.. ipython:: python
+
+ dfa = pd.DataFrame(
+ np.random.randn(1000, 3),
+ index=pd.date_range("1/1/2000", periods=1000),
+ columns=["A", "B", "C"],
+ )
+ r = dfa.rolling(window=60, min_periods=1)
+ r
+
+We can aggregate by passing a function to the entire DataFrame, or select a
+Series (or multiple Series) via standard ``__getitem__``.
+
+.. ipython:: python
+
+ r.aggregate(np.sum)
+
+ r["A"].aggregate(np.sum)
+
+ r[["A", "B"]].aggregate(np.sum)
+
+As you can see, the result of the aggregation will have the selected columns, or all
+columns if none are selected.
+
+.. _stats.aggregate.multifunc:
+
+Applying multiple functions
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+With windowed ``Series`` you can also pass a list of functions to do
+aggregation with, outputting a DataFrame:
+
+.. ipython:: python
+
+ r["A"].agg([np.sum, np.mean, np.std])
+
+On a windowed DataFrame, you can pass a list of functions to apply to each
+column, which produces an aggregated result with a hierarchical index:
+
+.. ipython:: python
+
+ r.agg([np.sum, np.mean])
+
+Passing a dict of functions has different behavior by default, see the next
+section.
+
+Applying different functions to DataFrame columns
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+By passing a dict to ``aggregate`` you can apply a different aggregation to the
+columns of a ``DataFrame``:
+
+.. ipython:: python
+
+ r.agg({"A": np.sum, "B": lambda x: np.std(x, ddof=1)})
+
+The function names can also be strings. In order for a string to be valid it
+must be implemented on the windowed object:
+
+.. ipython:: python
+
+ r.agg({"A": "sum", "B": "std"})
+
+Furthermore you can pass a nested dict to indicate different aggregations on different columns.
+
+.. ipython:: python
+
+ r.agg({"A": ["sum", "std"], "B": ["mean", "std"]})
+
+
+.. _stats.moments.expanding:
+
+Expanding windows
+-----------------
+
+A common alternative to rolling statistics is to use an *expanding* window,
+which yields the value of the statistic with all the data available up to that
+point in time.
+
+These follow a similar interface to ``.rolling``, with the ``.expanding`` method
+returning an :class:`~pandas.core.window.Expanding` object.
+
+As these calculations are a special case of rolling statistics,
+they are implemented in pandas such that the following two calls are equivalent:
+
+.. ipython:: python
+
+ df.rolling(window=len(df), min_periods=1).mean()[:5]
+
+ df.expanding(min_periods=1).mean()[:5]
+
+These have a similar set of methods to ``.rolling`` methods.
+
+Method summary
+~~~~~~~~~~~~~~
+
+.. currentmodule:: pandas.core.window
+
+.. csv-table::
+ :header: "Function", "Description"
+ :widths: 20, 80
+
+ :meth:`~Expanding.count`, Number of non-null observations
+ :meth:`~Expanding.sum`, Sum of values
+ :meth:`~Expanding.mean`, Mean of values
+ :meth:`~Expanding.median`, Arithmetic median of values
+ :meth:`~Expanding.min`, Minimum
+ :meth:`~Expanding.max`, Maximum
+ :meth:`~Expanding.std`, Sample standard deviation
+ :meth:`~Expanding.var`, Sample variance
+ :meth:`~Expanding.skew`, Sample skewness (3rd moment)
+ :meth:`~Expanding.kurt`, Sample kurtosis (4th moment)
+ :meth:`~Expanding.quantile`, Sample quantile (value at %)
+ :meth:`~Expanding.apply`, Generic apply
+ :meth:`~Expanding.cov`, Sample covariance (binary)
+ :meth:`~Expanding.corr`, Sample correlation (binary)
+ :meth:`~Expanding.sem`, Standard error of mean
+
+.. note::
+
+ Using sample variance formulas for :meth:`~Expanding.std` and
+ :meth:`~Expanding.var` comes with the same caveats as using them with rolling
+ windows. See :ref:`this section ` for more
+ information.
+
+ The same caveats apply to using any supported statistical sample methods.
+
+.. currentmodule:: pandas
+
+Aside from not having a ``window`` parameter, these functions have the same
+interfaces as their ``.rolling`` counterparts. Like above, the parameters they
+all accept are:
+
+* ``min_periods``: threshold of non-null data points to require. Defaults to
+ minimum needed to compute statistic. No ``NaNs`` will be output once
+ ``min_periods`` non-null data points have been seen.
+* ``center``: boolean, whether to set the labels at the center (default is False).
+
+.. _stats.moments.expanding.note:
+.. note::
+
+ The ``.rolling`` and ``.expanding`` methods do not return a
+ ``NaN`` if there are at least ``min_periods`` non-null values in the current
+ window. For example:
+
+ .. ipython:: python
+
+ sn = pd.Series([1, 2, np.nan, 3, np.nan, 4])
+ sn
+ sn.rolling(2).max()
+ sn.rolling(2, min_periods=1).max()
+
+ In case of expanding functions, this differs from :meth:`~DataFrame.cumsum`,
+ :meth:`~DataFrame.cumprod`, :meth:`~DataFrame.cummax`,
+ and :meth:`~DataFrame.cummin`, which return ``NaN`` in the output wherever
+ a ``NaN`` is encountered in the input. In order to match the output of ``cumsum``
+ with ``expanding``, use :meth:`~DataFrame.fillna`:
+
+ .. ipython:: python
+
+ sn.expanding().sum()
+ sn.cumsum()
+ sn.cumsum().fillna(method="ffill")
+
+
+An expanding window statistic will be more stable (and less responsive) than
+its rolling window counterpart as the increasing window size decreases the
+relative impact of an individual data point. As an example, here is the
+:meth:`~core.window.Expanding.mean` output for the previous time series dataset:
+
+.. ipython:: python
+ :suppress:
+
+ plt.close("all")
+
+.. ipython:: python
+
+ s.plot(style="k--")
+
+ @savefig expanding_mean_frame.png
+ s.expanding().mean().plot(style="k")
+
+
+.. _stats.moments.exponentially_weighted:
+
+Exponentially weighted windows
+------------------------------
+
+.. currentmodule:: pandas.core.window
+
+A related set of functions are exponentially weighted versions of several of
+the above statistics. A similar interface to ``.rolling`` and ``.expanding`` is accessed
+through the ``.ewm`` method to receive an :class:`~ExponentialMovingWindow` object.
+A number of expanding EW (exponentially weighted)
+methods are provided:
+
+
+.. csv-table::
+ :header: "Function", "Description"
+ :widths: 20, 80
+
+ :meth:`~ExponentialMovingWindow.mean`, EW moving average
+ :meth:`~ExponentialMovingWindow.var`, EW moving variance
+ :meth:`~ExponentialMovingWindow.std`, EW moving standard deviation
+ :meth:`~ExponentialMovingWindow.corr`, EW moving correlation
+ :meth:`~ExponentialMovingWindow.cov`, EW moving covariance
+
+In general, a weighted moving average is calculated as
+
+.. math::
+
+ y_t = \frac{\sum_{i=0}^t w_i x_{t-i}}{\sum_{i=0}^t w_i},
+
+where :math:`x_t` is the input, :math:`y_t` is the result and the :math:`w_i`
+are the weights.
+
+The EW functions support two variants of exponential weights.
+The default, ``adjust=True``, uses the weights :math:`w_i = (1 - \alpha)^i`
+which gives
+
+.. math::
+
+ y_t = \frac{x_t + (1 - \alpha)x_{t-1} + (1 - \alpha)^2 x_{t-2} + ...
+ + (1 - \alpha)^t x_{0}}{1 + (1 - \alpha) + (1 - \alpha)^2 + ...
+ + (1 - \alpha)^t}
+
+When ``adjust=False`` is specified, moving averages are calculated as
+
+.. math::
+
+ y_0 &= x_0 \\
+ y_t &= (1 - \alpha) y_{t-1} + \alpha x_t,
+
+which is equivalent to using weights
+
+.. math::
+
+ w_i = \begin{cases}
+ \alpha (1 - \alpha)^i & \text{if } i < t \\
+ (1 - \alpha)^i & \text{if } i = t.
+ \end{cases}
+
+.. note::
+
+ These equations are sometimes written in terms of :math:`\alpha' = 1 - \alpha`, e.g.
+
+ .. math::
+
+ y_t = \alpha' y_{t-1} + (1 - \alpha') x_t.
+
+The difference between the above two variants arises because we are
+dealing with series which have finite history. Consider a series of infinite
+history, with ``adjust=True``:
+
+.. math::
+
+ y_t = \frac{x_t + (1 - \alpha)x_{t-1} + (1 - \alpha)^2 x_{t-2} + ...}
+ {1 + (1 - \alpha) + (1 - \alpha)^2 + ...}
+
+Noting that the denominator is a geometric series with initial term equal to 1
+and a ratio of :math:`1 - \alpha` we have
+
+.. math::
+
+ y_t &= \frac{x_t + (1 - \alpha)x_{t-1} + (1 - \alpha)^2 x_{t-2} + ...}
+ {\frac{1}{1 - (1 - \alpha)}}\\
+ &= [x_t + (1 - \alpha)x_{t-1} + (1 - \alpha)^2 x_{t-2} + ...] \alpha \\
+ &= \alpha x_t + [(1-\alpha)x_{t-1} + (1 - \alpha)^2 x_{t-2} + ...]\alpha \\
+ &= \alpha x_t + (1 - \alpha)[x_{t-1} + (1 - \alpha) x_{t-2} + ...]\alpha\\
+ &= \alpha x_t + (1 - \alpha) y_{t-1}
+
+which is the same expression as ``adjust=False`` above and therefore
+shows the equivalence of the two variants for infinite series.
+When ``adjust=False``, we have :math:`y_0 = x_0` and
+:math:`y_t = \alpha x_t + (1 - \alpha) y_{t-1}`.
+Therefore, there is an assumption that :math:`x_0` is not an ordinary value
+but rather an exponentially weighted moment of the infinite series up to that
+point.
+
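+A minimal numerical sketch of the two variants (the short ``s_adj`` series is
+constructed only for illustration):
+
+.. ipython:: python
+
+ s_adj = pd.Series([1.0, 2.0, 3.0])
+ # adjust=True (default): weights (1 - alpha)**i, renormalized at each step
+ s_adj.ewm(alpha=0.5, adjust=True).mean()
+ # adjust=False: recursive form y_t = (1 - alpha) * y_{t-1} + alpha * x_t
+ s_adj.ewm(alpha=0.5, adjust=False).mean()
+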
+One must have :math:`0 < \alpha \leq 1`, and while it is possible to pass
+:math:`\alpha` directly, it's often easier to think about either the
+**span**, **center of mass (com)** or **half-life** of an EW moment:
+
+.. math::
+
+ \alpha =
+ \begin{cases}
+ \frac{2}{s + 1}, & \text{for span}\ s \geq 1\\
+ \frac{1}{1 + c}, & \text{for center of mass}\ c \geq 0\\
+ 1 - \exp^{\frac{\log 0.5}{h}}, & \text{for half-life}\ h > 0
+ \end{cases}
+
+One must specify precisely one of **span**, **center of mass**, **half-life**
+and **alpha** to the EW functions:
+
+* **Span** corresponds to what is commonly called an "N-day EW moving average".
+* **Center of mass** has a more physical interpretation and can be thought of
+ in terms of span: :math:`c = (s - 1) / 2`.
+* **Half-life** is the period of time for the exponential weight to reduce to
+ one half.
+* **Alpha** specifies the smoothing factor directly.
+
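+For example, a span of 20 corresponds to :math:`\alpha = 2 / (20 + 1)`, so the
+following two calls are equivalent (a small sketch reusing the ``s`` series from
+above):
+
+.. ipython:: python
+
+ s.ewm(span=20).mean().head()
+ s.ewm(alpha=2 / 21).mean().head()
+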
+.. versionadded:: 1.1.0
+
+You can also specify ``halflife`` in terms of a timedelta convertible unit to specify the amount of
+time it takes for an observation to decay to half its value when also specifying a sequence
+of ``times``.
+
+.. ipython:: python
+
+ df = pd.DataFrame({"B": [0, 1, 2, np.nan, 4]})
+ df
+ times = ["2020-01-01", "2020-01-03", "2020-01-10", "2020-01-15", "2020-01-17"]
+ df.ewm(halflife="4 days", times=pd.DatetimeIndex(times)).mean()
+
+The following formula is used to compute exponentially weighted mean with an input vector of times:
+
+.. math::
+
+ y_t = \frac{\sum_{i=0}^t 0.5^\frac{t_{t} - t_{i}}{\lambda} x_{t-i}}{\sum_{i=0}^t 0.5^\frac{t_{t} - t_{i}}{\lambda}},
+
+Here is an example for a univariate time series:
+
+.. ipython:: python
+
+ s.plot(style="k--")
+
+ @savefig ewma_ex.png
+ s.ewm(span=20).mean().plot(style="k")
+
+ExponentialMovingWindow has a ``min_periods`` argument, which has the same
+meaning it does for all the ``.expanding`` and ``.rolling`` methods:
+no output values will be set until at least ``min_periods`` non-null values
+are encountered in the (expanding) window.
+
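+For instance (a minimal sketch with a small illustrative series), no output is
+produced until ``min_periods`` non-null values have been seen:
+
+.. ipython:: python
+
+ pd.Series([1.0, 2.0, 3.0]).ewm(span=3, min_periods=2).mean()
+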
+ExponentialMovingWindow also has an ``ignore_na`` argument, which determines how
+intermediate null values affect the calculation of the weights.
+When ``ignore_na=False`` (the default), weights are calculated based on absolute
+positions, so that intermediate null values affect the result.
+When ``ignore_na=True``,
+weights are calculated by ignoring intermediate null values.
+For example, assuming ``adjust=True``, if ``ignore_na=False``, the weighted
+average of ``3, NaN, 5`` would be calculated as
+
+.. math::
+
+ \frac{(1-\alpha)^2 \cdot 3 + 1 \cdot 5}{(1-\alpha)^2 + 1}.
+
+Whereas if ``ignore_na=True``, the weighted average would be calculated as
+
+.. math::
+
+ \frac{(1-\alpha) \cdot 3 + 1 \cdot 5}{(1-\alpha) + 1}.
+
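+This can be checked directly (a small sketch using the ``3, NaN, 5`` values from
+above):
+
+.. ipython:: python
+
+ s_na = pd.Series([3.0, np.nan, 5.0])
+ s_na.ewm(alpha=0.5, ignore_na=False).mean()
+ s_na.ewm(alpha=0.5, ignore_na=True).mean()
+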
+The :meth:`~ExponentialMovingWindow.var`, :meth:`~ExponentialMovingWindow.std`, and :meth:`~ExponentialMovingWindow.cov` functions have a ``bias`` argument,
+specifying whether the result should contain biased or unbiased statistics.
+For example, if ``bias=True``, ``ewmvar(x)`` is calculated as
+``ewmvar(x) = ewma(x**2) - ewma(x)**2``;
+whereas if ``bias=False`` (the default), the biased variance statistics
+are scaled by debiasing factors
+
+.. math::
+
+ \frac{\left(\sum_{i=0}^t w_i\right)^2}{\left(\sum_{i=0}^t w_i\right)^2 - \sum_{i=0}^t w_i^2}.
+
+(For :math:`w_i = 1`, this reduces to the usual :math:`N / (N - 1)` factor,
+with :math:`N = t + 1`.)
+See `Weighted Sample Variance `__
+on Wikipedia for further details.
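+
+For example, the biased and debiased estimates can be compared directly (a small
+sketch reusing the ``s`` series from above):
+
+.. ipython:: python
+
+ s.ewm(span=20).std(bias=True).head()
+ s.ewm(span=20).std().head()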
diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst
index 5a6f56388dee5..939acf10d6c0b 100644
--- a/doc/source/user_guide/cookbook.rst
+++ b/doc/source/user_guide/cookbook.rst
@@ -18,6 +18,9 @@ above what the in-line examples offer.
pandas (pd) and Numpy (np) are the only two abbreviated imported modules. The rest are kept
explicitly imported for newer users.
+These examples are written for Python 3. Minor tweaks might be necessary for earlier python
+versions.
+
Idioms
------
@@ -68,7 +71,7 @@ Or use pandas where after you've set up a mask
)
df.where(df_mask, -1000)
-`if-then-else using NumPy's where()
+`if-then-else using numpy's where()
`__
.. ipython:: python
@@ -1010,7 +1013,7 @@ The :ref:`Plotting ` docs.
`Setting x-axis major and minor labels
`__
-`Plotting multiple charts in an IPython Jupyter notebook
+`Plotting multiple charts in an ipython notebook
`__
`Creating a multi-line plot
diff --git a/doc/source/user_guide/dsintro.rst b/doc/source/user_guide/dsintro.rst
index f2bb99dd2ebc0..905877cca61db 100644
--- a/doc/source/user_guide/dsintro.rst
+++ b/doc/source/user_guide/dsintro.rst
@@ -439,7 +439,7 @@ Data Classes as introduced in `PEP557
can be passed into the DataFrame constructor.
Passing a list of dataclasses is equivalent to passing a list of dictionaries.
-Please be aware, that all values in the list should be dataclasses, mixing
+Please be aware, that all values in the list should be dataclasses, mixing
types in the list would result in a TypeError.
.. ipython:: python
diff --git a/doc/source/user_guide/enhancingperf.rst b/doc/source/user_guide/enhancingperf.rst
index 42621c032416d..cc8de98165fac 100644
--- a/doc/source/user_guide/enhancingperf.rst
+++ b/doc/source/user_guide/enhancingperf.rst
@@ -96,7 +96,7 @@ hence we'll concentrate our efforts cythonizing these two functions.
Plain Cython
~~~~~~~~~~~~
-First we're going to need to import the Cython magic function to IPython:
+First we're going to need to import the Cython magic function to ipython:
.. ipython:: python
:okwarning:
@@ -123,7 +123,7 @@ is here to distinguish between function versions):
.. note::
If you're having trouble pasting the above into your ipython, you may need
- to be using bleeding edge IPython for paste to play well with cell magics.
+ to be using bleeding edge ipython for paste to play well with cell magics.
.. code-block:: ipython
@@ -160,7 +160,7 @@ We get another huge improvement simply by providing type information:
In [4]: %timeit df.apply(lambda x: integrate_f_typed(x["a"], x["b"], x["N"]), axis=1)
10 loops, best of 3: 20.3 ms per loop
-Now, we're talking! It's now over ten times faster than the original Python
+Now, we're talking! It's now over ten times faster than the original python
implementation, and we haven't *really* modified the code. Let's have another
look at what's eating up time:
@@ -375,7 +375,7 @@ Numba as an argument
Additionally, we can leverage the power of `Numba `__
by calling it as an argument in :meth:`~Rolling.apply`. See :ref:`Computation tools
-` for an extensive example.
+` for an extensive example.
Vectorize
~~~~~~~~~
diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst
index d6081155b58db..e8866daa9d99f 100644
--- a/doc/source/user_guide/groupby.rst
+++ b/doc/source/user_guide/groupby.rst
@@ -478,7 +478,7 @@ Aggregation
Once the GroupBy object has been created, several methods are available to
perform a computation on the grouped data. These operations are similar to the
-:ref:`aggregating API `, :ref:`window API `,
+:ref:`aggregating API `, :ref:`window functions API `,
and :ref:`resample API `.
An obvious one is aggregation via the
@@ -524,15 +524,6 @@ index are the group names and whose values are the sizes of each group.
grouped.describe()
-Another aggregation example is to compute the number of unique values of each group. This is similar to the ``value_counts`` function, except that it only counts unique values.
-
-.. ipython:: python
-
- ll = [['foo', 1], ['foo', 2], ['foo', 2], ['bar', 1], ['bar', 1]]
- df4 = pd.DataFrame(ll, columns=["A", "B"])
- df4
- df4.groupby("A")["B"].nunique()
-
.. note::
Aggregation functions **will not** return the groups that you are aggregating over
@@ -681,7 +672,7 @@ accepts the special syntax in :meth:`GroupBy.agg`, known as "named aggregation",
)
-If your desired output column names are not valid Python keywords, construct a dictionary
+If your desired output column names are not valid Python keywords, construct a dictionary
and unpack the keyword arguments
.. ipython:: python
@@ -1099,7 +1090,7 @@ will be passed into ``values``, and the group index will be passed into ``index`
.. warning::
When using ``engine='numba'``, there will be no "fall back" behavior internally. The group
- data and group index will be passed as NumPy arrays to the JITed user defined function, and no
+ data and group index will be passed as NumPy arrays to the JITed user defined function, and no
alternative execution attempts will be tried.
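
A minimal sketch of such a user defined function, assuming the optional Numba
dependency is installed (the frame and column names below are illustrative only):

.. code-block:: python

   import pandas as pd

   def group_mean(values, index):
       # values and index arrive as NumPy arrays when engine="numba"
       return values.mean()

   df = pd.DataFrame({"A": ["x", "x", "y"], "B": [1.0, 2.0, 3.0]})
   df.groupby("A")["B"].agg(group_mean, engine="numba")
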
.. note::
diff --git a/doc/source/user_guide/index.rst b/doc/source/user_guide/index.rst
index 901f42097b911..2fc9e066e6712 100644
--- a/doc/source/user_guide/index.rst
+++ b/doc/source/user_guide/index.rst
@@ -40,7 +40,6 @@ Further information on any specific method can be obtained in the
visualization
computation
groupby
- window
timeseries
timedeltas
style
diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst
index 817ea3445f995..98c981539d207 100644
--- a/doc/source/user_guide/indexing.rst
+++ b/doc/source/user_guide/indexing.rst
@@ -55,7 +55,7 @@ of multi-axis indexing.
*label* of the index. This use is **not** an integer position along the
index.).
* A list or array of labels ``['a', 'b', 'c']``.
- * A slice object with labels ``'a':'f'`` (Note that contrary to usual Python
+ * A slice object with labels ``'a':'f'`` (Note that contrary to usual Python
slices, **both** the start and the stop are included, when present in the
index! See :ref:`Slicing with labels `
and :ref:`Endpoints are inclusive `.)
@@ -327,7 +327,7 @@ The ``.loc`` attribute is the primary access method. The following are valid inp
* A single label, e.g. ``5`` or ``'a'`` (Note that ``5`` is interpreted as a *label* of the index. This use is **not** an integer position along the index.).
* A list or array of labels ``['a', 'b', 'c']``.
-* A slice object with labels ``'a':'f'`` (Note that contrary to usual Python
+* A slice object with labels ``'a':'f'`` (Note that contrary to usual Python
slices, **both** the start and the stop are included, when present in the
index! See :ref:`Slicing with labels `.
* A boolean array.
@@ -509,11 +509,11 @@ For getting a cross section using an integer position (equiv to ``df.xs(1)``):
df1.iloc[1]
-Out of range slice indexes are handled gracefully just as in Python/NumPy.
+Out of range slice indexes are handled gracefully just as in Python/NumPy.
.. ipython:: python
- # these are allowed in Python/NumPy.
+ # these are allowed in Python/NumPy.
x = list('abcdef')
x
x[4:10]
@@ -584,20 +584,48 @@ without using a temporary variable.
(bb.groupby(['year', 'team']).sum()
.loc[lambda df: df['r'] > 100])
+.. _indexing.deprecate_ix:
-.. _combining_positional_and_label_based_indexing:
+IX indexer is deprecated
+------------------------
+
+.. warning::
+
+ .. versionchanged:: 1.0.0
+
+ The ``.ix`` indexer was removed, in favor of the more strict ``.iloc`` and ``.loc`` indexers.
-Combining positional and label-based indexing
----------------------------------------------
+``.ix`` offers a lot of magic on the inference of what the user wants to do. To wit, ``.ix`` can decide
+to index *positionally* OR via *labels* depending on the data type of the index. This has caused quite a
+bit of user confusion over the years.
-If you wish to get the 0th and the 2nd elements from the index in the 'A' column, you can do:
+The recommended methods of indexing are:
+
+* ``.loc`` if you want to *label* index.
+* ``.iloc`` if you want to *positionally* index.
.. ipython:: python
dfd = pd.DataFrame({'A': [1, 2, 3],
'B': [4, 5, 6]},
index=list('abc'))
+
dfd
+
+Previous behavior, where you wish to get the 0th and the 2nd elements from the index in the 'A' column.
+
+.. code-block:: ipython
+
+ In [3]: dfd.ix[[0, 2], 'A']
+ Out[3]:
+ a 1
+ c 3
+ Name: A, dtype: int64
+
+Using ``.loc``. Here we will select the appropriate indexes from the index, then use *label* indexing.
+
+.. ipython:: python
+
dfd.loc[dfd.index[[0, 2]], 'A']
This can also be expressed using ``.iloc``, by explicitly getting locations on the indexers, and using
@@ -1130,40 +1158,6 @@ Mask
s.mask(s >= 0)
df.mask(df >= 0)
-.. _indexing.np_where:
-
-Setting with enlargement conditionally using :func:`numpy`
-----------------------------------------------------------
-
-An alternative to :meth:`~pandas.DataFrame.where` is to use :func:`numpy.where`.
-Combined with setting a new column, you can use it to enlarge a dataframe where the
-values are determined conditionally.
-
-Consider you have two choices to choose from in the following dataframe. And you want to
-set a new column color to 'green' when the second column has 'Z'. You can do the
-following:
-
-.. ipython:: python
-
- df = pd.DataFrame({'col1': list('ABBC'), 'col2': list('ZZXY')})
- df['color'] = np.where(df['col2'] == 'Z', 'green', 'red')
- df
-
-If you have multiple conditions, you can use :func:`numpy.select` to achieve that. Say
-corresponding to three conditions there are three choice of colors, with a fourth color
-as a fallback, you can do the following.
-
-.. ipython:: python
-
- conditions = [
- (df['col2'] == 'Z') & (df['col1'] == 'A'),
- (df['col2'] == 'Z') & (df['col1'] == 'B'),
- (df['col1'] == 'B')
- ]
- choices = ['yellow', 'blue', 'purple']
- df['color'] = np.select(conditions, choices, default='black')
- df
-
.. _indexing.query:
The :meth:`~pandas.DataFrame.query` Method
diff --git a/doc/source/user_guide/integer_na.rst b/doc/source/user_guide/integer_na.rst
index 2d5673fe53be3..be38736f493b5 100644
--- a/doc/source/user_guide/integer_na.rst
+++ b/doc/source/user_guide/integer_na.rst
@@ -117,7 +117,7 @@ dtype if needed.
# coerce when needed
s + 0.01
-These dtypes can operate as part of ``DataFrame``.
+These dtypes can operate as part of a ``DataFrame``.
.. ipython:: python
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 1bd35131622ab..1c271e74aafba 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -1024,10 +1024,9 @@ Writing CSVs to binary file objects
.. versionadded:: 1.2.0
-``df.to_csv(..., mode="wb")`` allows writing a CSV to a file object
-opened binary mode. In most cases, it is not necessary to specify
-``mode`` as Pandas will auto-detect whether the file object is
-opened in text or binary mode.
+``df.to_csv(..., mode="w+b")`` allows writing a CSV to a file object
+opened in binary mode. For this to work, it is necessary that ``mode``
+contains a "b":
.. ipython:: python
@@ -1035,7 +1034,7 @@ opened in text or binary mode.
data = pd.DataFrame([0, 1, 2])
buffer = io.BytesIO()
- data.to_csv(buffer, encoding="utf-8", compression="gzip")
+ data.to_csv(buffer, mode="w+b", encoding="utf-8", compression="gzip")
.. _io.float_precision:
diff --git a/doc/source/user_guide/merging.rst b/doc/source/user_guide/merging.rst
index d8998a9a0a6e1..f1a28dc30dd68 100644
--- a/doc/source/user_guide/merging.rst
+++ b/doc/source/user_guide/merging.rst
@@ -194,7 +194,7 @@ behavior:
},
index=[2, 3, 6, 7],
)
- result = pd.concat([df1, df4], axis=1)
+ result = pd.concat([df1, df4], axis=1, sort=False)
.. ipython:: python
@@ -204,6 +204,13 @@ behavior:
p.plot([df1, df4], result, labels=["df1", "df4"], vertical=False);
plt.close("all");
+.. warning::
+
+ The default behavior with ``join='outer'`` is to sort the other axis
+ (columns in this case). In a future version of pandas, the default will
+ be to not sort. We specified ``sort=False`` to opt in to the new
+ behavior now.
+
Here is the same thing with ``join='inner'``:
.. ipython:: python
diff --git a/doc/source/user_guide/options.rst b/doc/source/user_guide/options.rst
index c828bc28826b1..d222297abc70b 100644
--- a/doc/source/user_guide/options.rst
+++ b/doc/source/user_guide/options.rst
@@ -124,13 +124,13 @@ are restored automatically when you exit the ``with`` block:
Setting startup options in Python/IPython environment
-----------------------------------------------------
-Using startup scripts for the Python/IPython environment to import pandas and set options makes working with pandas more efficient. To do this, create a .py or .ipy script in the startup directory of the desired profile. An example where the startup folder is in a default IPython profile can be found at:
+Using startup scripts for the Python/IPython environment to import pandas and set options makes working with pandas more efficient. To do this, create a .py or .ipy script in the startup directory of the desired profile. An example where the startup folder is in a default IPython profile can be found at:
.. code-block:: none
$IPYTHONDIR/profile_default/startup
-More information can be found in the `IPython documentation
+More information can be found in the `IPython documentation
`__. An example startup script for pandas is displayed below:
.. code-block:: python
@@ -332,7 +332,7 @@ display.large_repr truncate For DataFrames exceeding ma
(the behaviour in earlier versions of pandas).
allowable settings, ['truncate', 'info']
display.latex.repr False Whether to produce a latex DataFrame
- representation for Jupyter frontends
+ representation for Jupyter frontends
that support it.
display.latex.escape True Escapes special characters in DataFrames, when
using the to_latex method.
@@ -413,7 +413,7 @@ display.show_dimensions truncate Whether to print out dimens
frame is truncated (e.g. not display
all rows and/or columns)
display.width 80 Width of the display in characters.
- In case Python/IPython is running in
+ In case Python/IPython is running in
a terminal this can be set to None
and pandas will correctly auto-detect
the width. Note that the IPython notebook,
diff --git a/doc/source/user_guide/sparse.rst b/doc/source/user_guide/sparse.rst
index e4eea57c43dbb..3156e3088d860 100644
--- a/doc/source/user_guide/sparse.rst
+++ b/doc/source/user_guide/sparse.rst
@@ -179,7 +179,7 @@ sparse values instead.
rather than a SparseSeries or SparseDataFrame.
This section provides some guidance on migrating your code to the new style. As a reminder,
-you can use the Python warnings module to control warnings. But we recommend modifying
+you can use the Python warnings module to control warnings. But we recommend modifying
your code, rather than ignoring the warning.
**Construction**
diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb
index 24f344488d1ca..12dd72f761408 100644
--- a/doc/source/user_guide/style.ipynb
+++ b/doc/source/user_guide/style.ipynb
@@ -793,8 +793,7 @@
"source": [
"The next option you have are \"table styles\".\n",
"These are styles that apply to the table as a whole, but don't look at the data.\n",
- "Certain stylings, including pseudo-selectors like `:hover` can only be used this way.\n",
- "These can also be used to set specific row or column based class selectors, as will be shown."
+ "Certain stylings, including pseudo-selectors like `:hover` can only be used this way."
]
},
{
@@ -832,32 +831,9 @@
"The value for `props` should be a list of tuples of `('attribute', 'value')`.\n",
"\n",
"`table_styles` are extremely flexible, but not as fun to type out by hand.\n",
- "We hope to collect some useful ones either in pandas, or preferable in a new package that [builds on top](#Extensibility) the tools here.\n",
- "\n",
- "`table_styles` can be used to add column and row based class descriptors. For large tables this can increase performance by avoiding repetitive individual css for each cell, and it can also simplify style construction in some cases.\n",
- "If `table_styles` is given as a dictionary each key should be a specified column or index value and this will map to specific class CSS selectors of the given column or row.\n",
- "\n",
- "Note that `Styler.set_table_styles` will overwrite existing styles but can be chained by setting the `overwrite` argument to `False`."
+ "We hope to collect some useful ones either in pandas, or preferable in a new package that [builds on top](#Extensibility) the tools here."
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "outputs": [],
- "source": [
- "html = html.set_table_styles({\n",
- " 'B': [dict(selector='', props=[('color', 'green')])],\n",
- " 'C': [dict(selector='td', props=[('color', 'red')])], \n",
- " }, overwrite=False)\n",
- "html"
- ],
- "metadata": {
- "collapsed": false,
- "pycharm": {
- "name": "#%%\n"
- }
- }
- },
{
"cell_type": "markdown",
"metadata": {},
@@ -946,12 +922,10 @@
"- DataFrame only `(use Series.to_frame().style)`\n",
"- The index and columns must be unique\n",
"- No large repr, and performance isn't great; this is intended for summary DataFrames\n",
- "- You can only style the *values*, not the index or columns (except with `table_styles` above)\n",
+ "- You can only style the *values*, not the index or columns\n",
"- You can only apply styles, you can't insert new HTML entities\n",
"\n",
- "Some of these will be addressed in the future.\n",
- "Performance can suffer when adding styles to each cell in a large DataFrame.\n",
- "It is recommended to apply table or column based styles where possible to limit overall HTML length, as well as setting a shorter UUID to avoid unnecessary repeated data transmission. \n"
+ "Some of these will be addressed in the future.\n"
]
},
{
diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst
index bee72ec70d95e..8044172bc4c4a 100644
--- a/doc/source/user_guide/timeseries.rst
+++ b/doc/source/user_guide/timeseries.rst
@@ -588,12 +588,10 @@ would include matching times on an included date:
.. warning::
- Indexing ``DataFrame`` rows with a *single* string with getitem (e.g. ``frame[dtstring]``)
- is deprecated starting with pandas 1.2.0 (given the ambiguity whether it is indexing
- the rows or selecting a column) and will be removed in a future version. The equivalent
- with ``.loc`` (e.g. ``frame.loc[dtstring]``) is still supported.
+ Indexing ``DataFrame`` rows with a *single* string with getitem (e.g. ``frame[dtstring]``) is deprecated in pandas 1.2.0 and will be removed in a future version. Use ``frame.loc[dtstring]`` instead.
.. ipython:: python
+ :okwarning:
dft = pd.DataFrame(
np.random.randn(100000, 1),
@@ -601,30 +599,34 @@ would include matching times on an included date:
index=pd.date_range("20130101", periods=100000, freq="T"),
)
dft
- dft.loc["2013"]
+ dft["2013"]
This starts on the very first time in the month, and includes the last date and
time for the month:
.. ipython:: python
+ :okwarning:
dft["2013-1":"2013-2"]
This specifies a stop time **that includes all of the times on the last day**:
.. ipython:: python
+ :okwarning:
dft["2013-1":"2013-2-28"]
This specifies an **exact** stop time (and is not the same as the above):
.. ipython:: python
+ :okwarning:
dft["2013-1":"2013-2-28 00:00:00"]
We are stopping on the included end-point as it is part of the index:
.. ipython:: python
+ :okwarning:
dft["2013-1-15":"2013-1-15 12:30:00"]
@@ -650,6 +652,7 @@ We are stopping on the included end-point as it is part of the index:
Slicing with string indexing also honors UTC offset.
.. ipython:: python
+ :okwarning:
df = pd.DataFrame([0], index=pd.DatetimeIndex(["2019-01-01"], tz="US/Pacific"))
df
@@ -701,14 +704,15 @@ If index resolution is second, then the minute-accurate timestamp gives a
series_second.index.resolution
series_second["2011-12-31 23:59"]
-If the timestamp string is treated as a slice, it can be used to index ``DataFrame`` with ``.loc[]`` as well.
+If the timestamp string is treated as a slice, it can be used to index ``DataFrame`` with ``[]`` as well.
.. ipython:: python
+ :okwarning:
dft_minute = pd.DataFrame(
{"a": [1, 2, 3], "b": [4, 5, 6]}, index=series_minute.index
)
- dft_minute.loc["2011-12-31 23"]
+ dft_minute["2011-12-31 23"]
.. warning::
@@ -1576,6 +1580,11 @@ some advanced strategies.
The ``resample()`` method can be used directly from ``DataFrameGroupBy`` objects,
see the :ref:`groupby docs `.
+.. note::
+
+ ``.resample()`` is similar to using a :meth:`~Series.rolling` operation with
+ a time-based offset, see a discussion :ref:`here `.
+
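The two can be compared over the same daily index; this is only a small sketch of
the similarity, since resampling uses non-overlapping bins while rolling uses
sliding windows:

.. code-block:: python

   import pandas as pd

   s = pd.Series(range(6), index=pd.date_range("2020-01-01", periods=6, freq="D"))
   s.resample("2D").sum()   # non-overlapping 2-day bins
   s.rolling("2D").sum()    # sliding 2-day windows ending at each observation
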
Basics
~~~~~~
@@ -1721,7 +1730,7 @@ We can instead only resample those groups where we have points as follows:
Aggregation
~~~~~~~~~~~
-Similar to the :ref:`aggregating API `, :ref:`groupby API `, and the :ref:`window API `,
+Similar to the :ref:`aggregating API `, :ref:`groupby API `, and the :ref:`window functions API `,
a ``Resampler`` can be selectively resampled.
Resampling a ``DataFrame``, the default will be to act on all columns with the same function.
@@ -2120,6 +2129,7 @@ You can pass in dates and strings to ``Series`` and ``DataFrame`` with ``PeriodI
Passing a string representing a lower frequency than ``PeriodIndex`` returns partial sliced data.
.. ipython:: python
+ :okwarning:
ps["2011"]
@@ -2129,7 +2139,7 @@ Passing a string representing a lower frequency than ``PeriodIndex`` returns par
index=pd.period_range("2013-01-01 9:00", periods=600, freq="T"),
)
dfp
- dfp.loc["2013-01-01 10H"]
+ dfp["2013-01-01 10H"]
As with ``DatetimeIndex``, the endpoints will be included in the result. The example below slices data starting from 10:00 to 11:59.
diff --git a/doc/source/user_guide/window.rst b/doc/source/user_guide/window.rst
deleted file mode 100644
index 05f8be091fa25..0000000000000
--- a/doc/source/user_guide/window.rst
+++ /dev/null
@@ -1,593 +0,0 @@
-.. _window:
-
-{{ header }}
-
-********************
-Windowing Operations
-********************
-
-pandas contains a compact set of APIs for performing windowing operations - an operation that performs
-an aggregation over a sliding partition of values. The API functions similarly to the ``groupby`` API
-in that :class:`Series` and :class:`DataFrame` call the windowing method with
-necessary parameters and then subsequently call the aggregation function.
-
-.. ipython:: python
-
- s = pd.Series(range(5))
- s.rolling(window=2).sum()
-
-The windows are determined by looking back the length of the window from the current observation.
-The result above can be derived by taking the sum of the following windowed partitions of data:
-
-.. ipython:: python
-
- for window in s.rolling(window=2):
- print(window)
-
-
-.. _window.overview:
-
-Overview
---------
-
-pandas supports 4 types of windowing operations:
-
-#. Rolling window: Generic fixed or variable sliding window over the values.
-#. Weighted window: Weighted, non-rectangular window supplied by the ``scipy.signal`` library.
-#. Expanding window: Accumulating window over the values.
-#. Exponentially Weighted window: Accumulating and exponentially weighted window over the values.
-
-============================= ================= =========================== =========================== ========================
-Concept Method Returned Object Supports time-based windows Supports chained groupby
-============================= ================= =========================== =========================== ========================
-Rolling window ``rolling`` ``Rolling`` Yes Yes
-Weighted window ``rolling`` ``Window`` No No
-Expanding window ``expanding`` ``Expanding`` No Yes
-Exponentially Weighted window ``ewm`` ``ExponentialMovingWindow`` No Yes (as of version 1.2)
-============================= ================= =========================== =========================== ========================
-
-As noted above, some operations support specifying a window based on a time offset:
-
-.. ipython:: python
-
- s = pd.Series(range(5), index=pd.date_range('2020-01-01', periods=5, freq='1D'))
- s.rolling(window='2D').sum()
-
-Additionally, some methods support chaining a ``groupby`` operation with a windowing operation
-which will first group the data by the specified keys and then perform a windowing operation per group.
-
-.. ipython:: python
-
- df = pd.DataFrame({'A': ['a', 'b', 'a', 'b', 'a'], 'B': range(5)})
- df.groupby('A').expanding().sum()
-
-.. note::
-
- Windowing operations currently only support numeric data (integer and float)
- and will always return ``float64`` values.
-
-.. warning::
-
- Some windowing aggregation methods, namely ``mean``, ``sum``, ``var`` and ``std``, may suffer from numerical
- imprecision due to the underlying windowing algorithms accumulating sums. When values differ
- in magnitude by roughly :math:`1/np.finfo(np.double).eps` this results in truncation. It must be
- noted that large values may have an impact on windows, which do not include these values. `Kahan summation
- `__ is used
- to compute the rolling sums to preserve accuracy as much as possible.
-
-
-All windowing operations support a ``min_periods`` argument that dictates the minimum number of
-non-``np.nan`` values a window must have; otherwise, the resulting value is ``np.nan``.
-``min_periods`` defaults to 1 for time-based windows and ``window`` for fixed windows.
-
-.. ipython:: python
-
- s = pd.Series([np.nan, 1, 2, np.nan, np.nan, 3])
- s.rolling(window=3, min_periods=1).sum()
- s.rolling(window=3, min_periods=2).sum()
- # Equivalent to min_periods=3
- s.rolling(window=3, min_periods=None).sum()
-
-
-Additionally, all windowing operations support the ``aggregate`` method for returning a result
-of multiple aggregations applied to a window.
-
-.. ipython:: python
-
- df = pd.DataFrame({"A": range(5), "B": range(10, 15)})
- df.expanding().agg([np.sum, np.mean, np.std])
-
-
-.. _window.generic:
-
-Rolling window
---------------
-
-Generic rolling windows support specifying windows as a fixed number of observations or variable
-number of observations based on an offset. If a time based offset is provided, the corresponding
-time based index must be monotonic.
-
-.. ipython:: python
-
- times = ['2020-01-01', '2020-01-03', '2020-01-04', '2020-01-05', '2020-01-29']
- s = pd.Series(range(5), index=pd.DatetimeIndex(times))
- s
- # Window with 2 observations
- s.rolling(window=2).sum()
- # Window with 2 days worth of observations
- s.rolling(window='2D').sum()
-
-For all supported aggregation functions, see :ref:`api.functions_rolling`.
-
-.. _window.center:
-
-Centering windows
-~~~~~~~~~~~~~~~~~
-
-By default the labels are set to the right edge of the window, but a
-``center`` keyword is available so the labels can be set at the center.
-
-.. ipython:: python
-
- s = pd.Series(range(10))
- s.rolling(window=5).mean()
- s.rolling(window=5, center=True).mean()
-
-
-.. _window.endpoints:
-
-Rolling window endpoints
-~~~~~~~~~~~~~~~~~~~~~~~~
-
-The inclusion of the interval endpoints in rolling window calculations can be specified with the ``closed``
-parameter:
-
-============= ====================
-Value Behavior
-============= ====================
-``'right'`` close right endpoint
-``'left'`` close left endpoint
-``'both'`` close both endpoints
-``'neither'`` open endpoints
-============= ====================
-
-For example, having the right endpoint open is useful in many problems that require that there is no contamination
-from present information back to past information. This allows the rolling window to compute statistics
-"up to that point in time", but not including that point in time.
-
-.. ipython:: python
-
- df = pd.DataFrame(
- {"x": 1},
- index=[
- pd.Timestamp("20130101 09:00:01"),
- pd.Timestamp("20130101 09:00:02"),
- pd.Timestamp("20130101 09:00:03"),
- pd.Timestamp("20130101 09:00:04"),
- pd.Timestamp("20130101 09:00:06"),
- ],
- )
-
- df["right"] = df.rolling("2s", closed="right").x.sum() # default
- df["both"] = df.rolling("2s", closed="both").x.sum()
- df["left"] = df.rolling("2s", closed="left").x.sum()
- df["neither"] = df.rolling("2s", closed="neither").x.sum()
-
- df
-
-
-.. _window.custom_rolling_window:
-
-Custom window rolling
-~~~~~~~~~~~~~~~~~~~~~
-
-.. versionadded:: 1.0
-
-In addition to accepting an integer or offset as a ``window`` argument, ``rolling`` also accepts
-a ``BaseIndexer`` subclass that allows a user to define a custom method for calculating window bounds.
-The ``BaseIndexer`` subclass will need to define a ``get_window_bounds`` method that returns
-a tuple of two arrays, the first being the starting indices of the windows and second being the
-ending indices of the windows. Additionally, ``num_values``, ``min_periods``, ``center``, and ``closed``
-will automatically be passed to ``get_window_bounds`` and the defined method must
-always accept these arguments.
-
-For example, if we have the following :class:`DataFrame`:
-
-.. ipython:: python
-
- use_expanding = [True, False, True, False, True]
- use_expanding
- df = pd.DataFrame({"values": range(5)})
- df
-
-and we want to use an expanding window where ``use_expanding`` is ``True`` otherwise a window of size
-1, we can create the following ``BaseIndexer`` subclass:
-
-.. code-block:: ipython
-
- In [2]: from pandas.api.indexers import BaseIndexer
- ...:
- ...: class CustomIndexer(BaseIndexer):
- ...:
- ...: def get_window_bounds(self, num_values, min_periods, center, closed):
- ...: start = np.empty(num_values, dtype=np.int64)
- ...: end = np.empty(num_values, dtype=np.int64)
- ...: for i in range(num_values):
- ...: if self.use_expanding[i]:
- ...: start[i] = 0
- ...: end[i] = i + 1
- ...: else:
- ...: start[i] = i
- ...: end[i] = i + self.window_size
- ...: return start, end
- ...:
-
- In [3]: indexer = CustomIndexer(window_size=1, use_expanding=use_expanding)
-
- In [4]: df.rolling(indexer).sum()
- Out[4]:
- values
- 0 0.0
- 1 1.0
- 2 3.0
- 3 3.0
- 4 10.0
-
-You can view other examples of ``BaseIndexer`` subclasses `here `__
-
-.. versionadded:: 1.1
-
-One subclass of note within those examples is the ``VariableOffsetWindowIndexer`` that allows
-rolling operations over a non-fixed offset like a ``BusinessDay``.
-
-.. ipython:: python
-
- from pandas.api.indexers import VariableOffsetWindowIndexer
-
- df = pd.DataFrame(range(10), index=pd.date_range("2020", periods=10))
- offset = pd.offsets.BDay(1)
- indexer = VariableOffsetWindowIndexer(index=df.index, offset=offset)
- df
- df.rolling(indexer).sum()
-
-For some problems knowledge of the future is available for analysis. For example, this occurs when
-each data point is a full time series read from an experiment, and the task is to extract underlying
-conditions. In these cases it can be useful to perform forward-looking rolling window computations.
-:func:`FixedForwardWindowIndexer ` class is available for this purpose.
-This :func:`BaseIndexer ` subclass implements a closed fixed-width
-forward-looking rolling window, and we can use it as follows:
-
-.. ipython:: python
-
- from pandas.api.indexers import FixedForwardWindowIndexer
- indexer = FixedForwardWindowIndexer(window_size=2)
- df.rolling(indexer, min_periods=1).sum()
-
-
-.. _window.rolling_apply:
-
-Rolling apply
-~~~~~~~~~~~~~
-
-The :meth:`~Rolling.apply` function takes an extra ``func`` argument and performs
-generic rolling computations. The ``func`` argument should be a single function
-that produces a single value from an ndarray input. ``raw`` specifies whether
-the windows are cast as :class:`Series` objects (``raw=False``) or ndarray objects (``raw=True``).
-
-.. ipython:: python
-
- def mad(x):
- return np.fabs(x - x.mean()).mean()
-
- s = pd.Series(range(10))
- s.rolling(window=4).apply(mad, raw=True)
-
-
-.. _window.numba_engine:
-
-Numba engine
-~~~~~~~~~~~~
-
-.. versionadded:: 1.0
-
-Additionally, :meth:`~Rolling.apply` can leverage `Numba `__
-if installed as an optional dependency. The apply aggregation can be executed using Numba by specifying
-``engine='numba'`` and ``engine_kwargs`` arguments (``raw`` must also be set to ``True``).
-Numba will be applied in potentially two routines:
-
-#. If ``func`` is a standard Python function, the engine will `JIT `__ the passed function. ``func`` can also be a JITed function in which case the engine will not JIT the function again.
-#. The engine will JIT the for loop where the apply function is applied to each window.
-
-The ``engine_kwargs`` argument is a dictionary of keyword arguments that will be passed into the
-`numba.jit decorator `__.
-These keyword arguments will be applied to *both* the passed function (if a standard Python function)
-and the apply for loop over each window. Currently only ``nogil``, ``nopython``, and ``parallel`` are supported,
-and their default values are set to ``False``, ``True`` and ``False`` respectively.
-
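A minimal sketch of passing ``engine_kwargs`` explicitly, spelling out the default
values named above:

.. code-block:: python

   import numpy as np
   import pandas as pd

   def f(x):
       return np.sum(x) + 5

   data = pd.Series(range(1_000))
   data.rolling(10).apply(
       f,
       engine="numba",
       raw=True,
       engine_kwargs={"nopython": True, "nogil": False, "parallel": False},
   )
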
-.. note::
-
- In terms of performance, **the first time a function is run using the Numba engine will be slow**
- as Numba will have some function compilation overhead. However, the compiled functions are cached,
- and subsequent calls will be fast. In general, the Numba engine is performant with
- a larger amount of data points (e.g. 1+ million).
-
-.. code-block:: ipython
-
- In [1]: data = pd.Series(range(1_000_000))
-
- In [2]: roll = data.rolling(10)
-
- In [3]: def f(x):
- ...: return np.sum(x) + 5
- # Run the first time, compilation time will affect performance
- In [4]: %timeit -r 1 -n 1 roll.apply(f, engine='numba', raw=True) # noqa: E225, E999
- 1.23 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
- # Function is cached and performance will improve
- In [5]: %timeit roll.apply(f, engine='numba', raw=True)
- 188 ms ± 1.93 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
-
- In [6]: %timeit roll.apply(f, engine='cython', raw=True)
- 3.92 s ± 59 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
-
-.. _window.cov_corr:
-
-Binary window functions
-~~~~~~~~~~~~~~~~~~~~~~~
-
-:meth:`~Rolling.cov` and :meth:`~Rolling.corr` can compute moving window statistics about
-two :class:`Series` or any combination of :class:`DataFrame`/:class:`Series` or
-:class:`DataFrame`/:class:`DataFrame`. Here is the behavior in each case:
-
-* two :class:`Series`: compute the statistic for the pairing.
-* :class:`DataFrame`/:class:`Series`: compute the statistics for each column of the DataFrame
- with the passed Series, thus returning a DataFrame.
-* :class:`DataFrame`/:class:`DataFrame`: by default compute the statistic for matching column
- names, returning a DataFrame. If the keyword argument ``pairwise=True`` is
- passed then computes the statistic for each pair of columns, returning a
- ``MultiIndexed DataFrame`` whose ``index`` are the dates in question (see :ref:`the next section
- `).
-
-For example:
-
-.. ipython:: python
-
- df = pd.DataFrame(
- np.random.randn(10, 4),
- index=pd.date_range("2020-01-01", periods=10),
- columns=["A", "B", "C", "D"],
- )
- df = df.cumsum()
-
- df2 = df[:4]
- df2.rolling(window=2).corr(df2["B"])
-
-.. _window.corr_pairwise:
-
-Computing rolling pairwise covariances and correlations
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-In financial data analysis and other fields it's common to compute covariance
-and correlation matrices for a collection of time series. Often one is also
-interested in moving-window covariance and correlation matrices. This can be
-done by passing the ``pairwise`` keyword argument, which in the case of
-:class:`DataFrame` inputs will yield a MultiIndexed :class:`DataFrame` whose ``index`` are the dates in
-question. In the case of a single DataFrame argument the ``pairwise`` argument
-can even be omitted:
-
-.. note::
-
- Missing values are ignored and each entry is computed using the pairwise
- complete observations. Please see the :ref:`covariance section
- ` for :ref:`caveats
- ` associated with this method of
- calculating covariance and correlation matrices.
-
-.. ipython:: python
-
- covs = (
- df[["B", "C", "D"]]
- .rolling(window=4)
- .cov(df[["A", "B", "C"]], pairwise=True)
- )
- covs
-
-
-.. _window.weighted:
-
-Weighted window
----------------
-
-The ``win_type`` argument in ``.rolling`` generates weighted windows that are commonly used in filtering
-and spectral estimation. ``win_type`` must be a string that corresponds to a `scipy.signal window function
-`__.
-Scipy must be installed in order to use these windows, and supplementary arguments
-that the Scipy window methods take must be specified in the aggregation function.
-
-
-.. ipython:: python
-
- s = pd.Series(range(10))
- s.rolling(window=5).mean()
- s.rolling(window=5, win_type="triang").mean()
- # Supplementary Scipy arguments passed in the aggregation function
- s.rolling(window=5, win_type="gaussian").mean(std=0.1)
-
-For all supported aggregation functions, see :ref:`api.functions_window`.
-
-.. _window.expanding:
-
-Expanding window
-----------------
-
-An expanding window yields the value of an aggregation statistic with all the data available up to that
-point in time. Since these calculations are a special case of rolling statistics,
-they are implemented in pandas such that the following two calls are equivalent:
-
-.. ipython:: python
-
- df = pd.DataFrame(range(5))
- df.rolling(window=len(df), min_periods=1).mean()
- df.expanding(min_periods=1).mean()
-
-For all supported aggregation functions, see :ref:`api.functions_expanding`.
-
-
-.. _window.exponentially_weighted:
-
-Exponentially Weighted window
------------------------------
-
-An exponentially weighted window is similar to an expanding window but with each prior point
-being exponentially weighted down relative to the current point.
-
-In general, a weighted moving average is calculated as
-
-.. math::
-
- y_t = \frac{\sum_{i=0}^t w_i x_{t-i}}{\sum_{i=0}^t w_i},
-
-where :math:`x_t` is the input, :math:`y_t` is the result and the :math:`w_i`
-are the weights.
-
-For all supported aggregation functions, see :ref:`api.functions_ewm`.
-
-The EW functions support two variants of exponential weights.
-The default, ``adjust=True``, uses the weights :math:`w_i = (1 - \alpha)^i`
-which gives
-
-.. math::
-
- y_t = \frac{x_t + (1 - \alpha)x_{t-1} + (1 - \alpha)^2 x_{t-2} + ...
- + (1 - \alpha)^t x_{0}}{1 + (1 - \alpha) + (1 - \alpha)^2 + ...
- + (1 - \alpha)^t}
-
-When ``adjust=False`` is specified, moving averages are calculated as
-
-.. math::
-
- y_0 &= x_0 \\
- y_t &= (1 - \alpha) y_{t-1} + \alpha x_t,
-
-which is equivalent to using weights
-
-.. math::
-
- w_i = \begin{cases}
- \alpha (1 - \alpha)^i & \text{if } i < t \\
- (1 - \alpha)^i & \text{if } i = t.
- \end{cases}
-
-.. note::
-
- These equations are sometimes written in terms of :math:`\alpha' = 1 - \alpha`, e.g.
-
- .. math::
-
- y_t = \alpha' y_{t-1} + (1 - \alpha') x_t.
-
-The difference between the above two variants arises because we are
-dealing with series which have finite history. Consider a series of infinite
-history, with ``adjust=True``:
-
-.. math::
-
- y_t = \frac{x_t + (1 - \alpha)x_{t-1} + (1 - \alpha)^2 x_{t-2} + ...}
- {1 + (1 - \alpha) + (1 - \alpha)^2 + ...}
-
-Noting that the denominator is a geometric series with initial term equal to 1
-and a ratio of :math:`1 - \alpha` we have
-
-.. math::
-
- y_t &= \frac{x_t + (1 - \alpha)x_{t-1} + (1 - \alpha)^2 x_{t-2} + ...}
- {\frac{1}{1 - (1 - \alpha)}}\\
- &= [x_t + (1 - \alpha)x_{t-1} + (1 - \alpha)^2 x_{t-2} + ...] \alpha \\
- &= \alpha x_t + [(1-\alpha)x_{t-1} + (1 - \alpha)^2 x_{t-2} + ...]\alpha \\
- &= \alpha x_t + (1 - \alpha)[x_{t-1} + (1 - \alpha) x_{t-2} + ...]\alpha\\
- &= \alpha x_t + (1 - \alpha) y_{t-1}
-
-which is the same expression as ``adjust=False`` above and therefore
-shows the equivalence of the two variants for infinite series.
-When ``adjust=False``, we have :math:`y_0 = x_0` and
-:math:`y_t = \alpha x_t + (1 - \alpha) y_{t-1}`.
-Therefore, there is an assumption that :math:`x_0` is not an ordinary value
-but rather an exponentially weighted moment of the infinite series up to that
-point.
-
-One must have :math:`0 < \alpha \leq 1`, and while it is possible to pass
-:math:`\alpha` directly, it's often easier to think about either the
-**span**, **center of mass (com)** or **half-life** of an EW moment:
-
-.. math::
-
- \alpha =
- \begin{cases}
- \frac{2}{s + 1}, & \text{for span}\ s \geq 1\\
- \frac{1}{1 + c}, & \text{for center of mass}\ c \geq 0\\
- 1 - \exp^{\frac{\log 0.5}{h}}, & \text{for half-life}\ h > 0
- \end{cases}
-
-One must specify precisely one of **span**, **center of mass**, **half-life**
-and **alpha** to the EW functions:
-
-* **Span** corresponds to what is commonly called an "N-day EW moving average".
-* **Center of mass** has a more physical interpretation and can be thought of
- in terms of span: :math:`c = (s - 1) / 2`.
-* **Half-life** is the period of time for the exponential weight to reduce to
- one half.
-* **Alpha** specifies the smoothing factor directly.
-
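For example, a span of 3 corresponds to :math:`\alpha = 2 / (3 + 1) = 0.5`, so the
following two calls give the same result (a small sketch of the relationship above):

.. code-block:: python

   import pandas as pd

   s = pd.Series([1.0, 2.0, 3.0, 4.0])
   s.ewm(span=3).mean()
   s.ewm(alpha=0.5).mean()  # alpha = 2 / (span + 1)
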
-.. versionadded:: 1.1.0
-
-You can also specify ``halflife`` in terms of a timedelta convertible unit to specify the amount of
-time it takes for an observation to decay to half its value when also specifying a sequence
-of ``times``.
-
-.. ipython:: python
-
- df = pd.DataFrame({"B": [0, 1, 2, np.nan, 4]})
- df
- times = ["2020-01-01", "2020-01-03", "2020-01-10", "2020-01-15", "2020-01-17"]
- df.ewm(halflife="4 days", times=pd.DatetimeIndex(times)).mean()
-
-The following formula is used to compute exponentially weighted mean with an input vector of times:
-
-.. math::
-
- y_t = \frac{\sum_{i=0}^t 0.5^\frac{t_{t} - t_{i}}{\lambda} x_{t-i}}{\sum_{i=0}^t 0.5^\frac{t_{t} - t_{i}}{\lambda}},
-
-
-ExponentialMovingWindow also has an ``ignore_na`` argument, which determines how
-intermediate null values affect the calculation of the weights.
-When ``ignore_na=False`` (the default), weights are calculated based on absolute
-positions, so that intermediate null values affect the result.
-When ``ignore_na=True``,
-weights are calculated by ignoring intermediate null values.
-For example, assuming ``adjust=True``, if ``ignore_na=False``, the weighted
-average of ``3, NaN, 5`` would be calculated as
-
-.. math::
-
- \frac{(1-\alpha)^2 \cdot 3 + 1 \cdot 5}{(1-\alpha)^2 + 1}.
-
-Whereas if ``ignore_na=True``, the weighted average would be calculated as
-
-.. math::
-
- \frac{(1-\alpha) \cdot 3 + 1 \cdot 5}{(1-\alpha) + 1}.
-
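A short sketch reproducing that example with :math:`\alpha = 0.5`, where the two
formulas above give 4.6 and 4.333... for the last point:

.. code-block:: python

   import numpy as np
   import pandas as pd

   s = pd.Series([3.0, np.nan, 5.0])
   s.ewm(alpha=0.5, ignore_na=False).mean()  # last value (0.25 * 3 + 5) / 1.25 = 4.6
   s.ewm(alpha=0.5, ignore_na=True).mean()   # last value (0.5 * 3 + 5) / 1.5 = 4.333...
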
-The :meth:`~Ewm.var`, :meth:`~Ewm.std`, and :meth:`~Ewm.cov` functions have a ``bias`` argument,
-specifying whether the result should contain biased or unbiased statistics.
-For example, if ``bias=True``, ``ewmvar(x)`` is calculated as
-``ewmvar(x) = ewma(x**2) - ewma(x)**2``;
-whereas if ``bias=False`` (the default), the biased variance statistics
-are scaled by debiasing factors
-
-.. math::
-
- \frac{\left(\sum_{i=0}^t w_i\right)^2}{\left(\sum_{i=0}^t w_i\right)^2 - \sum_{i=0}^t w_i^2}.
-
-(For :math:`w_i = 1`, this reduces to the usual :math:`N / (N - 1)` factor,
-with :math:`N = t + 1`.)
-See `Weighted Sample Variance `__
-on Wikipedia for further details.
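
A minimal sketch of the ``bias`` keyword in practice:

.. code-block:: python

   import pandas as pd

   s = pd.Series([1.0, 2.0, 3.0, 4.0])
   s.ewm(span=3).var(bias=False)  # default: debiased variance
   s.ewm(span=3).var(bias=True)   # ewma(x**2) - ewma(x)**2
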
diff --git a/doc/source/whatsnew/v0.12.0.rst b/doc/source/whatsnew/v0.12.0.rst
index c12adb2f1334f..4de76510c6bc1 100644
--- a/doc/source/whatsnew/v0.12.0.rst
+++ b/doc/source/whatsnew/v0.12.0.rst
@@ -419,7 +419,7 @@ Bug fixes
~~~~~~~~~
- Plotting functions now raise a ``TypeError`` before trying to plot anything
- if the associated objects have a dtype of ``object`` (:issue:`1818`,
+ if the associated objects have a dtype of ``object`` (:issue:`1818`,
:issue:`3572`, :issue:`3911`, :issue:`3912`), but they will try to convert object arrays to
numeric arrays if possible so that you can still plot, for example, an
object array with floats. This happens before any drawing takes place which
@@ -430,8 +430,8 @@ Bug fixes
- ``Series.str`` now supports iteration (:issue:`3638`). You can iterate over the
individual elements of each string in the ``Series``. Each iteration yields
- a ``Series`` with either a single character at each index of the original
- ``Series`` or ``NaN``. For example,
+ a ``Series`` with either a single character at each index of the
+ original ``Series`` or ``NaN``. For example,
.. ipython:: python
:okwarning:
diff --git a/doc/source/whatsnew/v0.14.0.rst b/doc/source/whatsnew/v0.14.0.rst
index b59938a9b9c9b..f2401c812a979 100644
--- a/doc/source/whatsnew/v0.14.0.rst
+++ b/doc/source/whatsnew/v0.14.0.rst
@@ -171,7 +171,7 @@ API changes
``expanding_cov``, ``expanding_corr`` to allow the calculation of moving
window covariance and correlation matrices (:issue:`4950`). See
:ref:`Computing rolling pairwise covariances and correlations
- ` in the docs.
+ ` in the docs.
.. code-block:: ipython
@@ -923,7 +923,7 @@ Bug fixes
- ``HDFStore.select_as_multiple`` handles start and stop the same way as ``select`` (:issue:`6177`)
- ``HDFStore.select_as_coordinates`` and ``select_column`` works with a ``where`` clause that results in filters (:issue:`6177`)
- Regression in join of non_unique_indexes (:issue:`6329`)
-- Issue with groupby ``agg`` with a single function and a mixed-type frame (:issue:`6337`)
+- Issue with groupby ``agg`` with a single function and a mixed-type frame (:issue:`6337`)
- Bug in ``DataFrame.replace()`` when passing a non- ``bool``
``to_replace`` argument (:issue:`6332`)
- Raise when trying to align on different levels of a MultiIndex assignment (:issue:`3738`)
diff --git a/doc/source/whatsnew/v0.15.0.rst b/doc/source/whatsnew/v0.15.0.rst
index fc2b070df4392..1f054930b3709 100644
--- a/doc/source/whatsnew/v0.15.0.rst
+++ b/doc/source/whatsnew/v0.15.0.rst
@@ -405,7 +405,7 @@ Rolling/expanding moments improvements
- :func:`rolling_window` now normalizes the weights properly in rolling mean mode (`mean=True`) so that
the calculated weighted means (e.g. 'triang', 'gaussian') are distributed about the same means as those
- calculated without weighting (i.e. 'boxcar'). See :ref:`the note on normalization ` for further details. (:issue:`7618`)
+ calculated without weighting (i.e. 'boxcar'). See :ref:`the note on normalization ` for further details. (:issue:`7618`)
.. ipython:: python
@@ -490,7 +490,7 @@ Rolling/expanding moments improvements
now have an optional ``adjust`` argument, just like :func:`ewma` does,
affecting how the weights are calculated.
The default value of ``adjust`` is ``True``, which is backwards-compatible.
- See :ref:`Exponentially weighted moment functions ` for details. (:issue:`7911`)
+ See :ref:`Exponentially weighted moment functions ` for details. (:issue:`7911`)
- :func:`ewma`, :func:`ewmstd`, :func:`ewmvol`, :func:`ewmvar`, :func:`ewmcov`, and :func:`ewmcorr`
now have an optional ``ignore_na`` argument.
@@ -595,7 +595,7 @@ Rolling/expanding moments improvements
3 1.425439
dtype: float64
- See :ref:`Exponentially weighted moment functions ` for details. (:issue:`7912`)
+ See :ref:`Exponentially weighted moment functions ` for details. (:issue:`7912`)
.. _whatsnew_0150.sql:
diff --git a/doc/source/whatsnew/v0.15.2.rst b/doc/source/whatsnew/v0.15.2.rst
index b5b25796fea73..95ca925f18692 100644
--- a/doc/source/whatsnew/v0.15.2.rst
+++ b/doc/source/whatsnew/v0.15.2.rst
@@ -136,7 +136,7 @@ Enhancements
- Added ability to export Categorical data to Stata (:issue:`8633`). See :ref:`here ` for limitations of categorical variables exported to Stata data files.
- Added flag ``order_categoricals`` to ``StataReader`` and ``read_stata`` to select whether to order imported categorical data (:issue:`8836`). See :ref:`here ` for more information on importing categorical variables from Stata data files.
-- Added ability to export Categorical data to/from HDF5 (:issue:`7621`). Queries work the same as if it was an object array. However, the ``category`` dtyped data is stored in a more efficient manner. See :ref:`here ` for an example and caveats w.r.t. prior versions of pandas.
+- Added ability to export Categorical data to/from HDF5 (:issue:`7621`). Queries work the same as if it was an object array. However, the ``category`` dtyped data is stored in a more efficient manner. See :ref:`here ` for an example and caveats w.r.t. prior versions of pandas.
- Added support for ``searchsorted()`` on ``Categorical`` class (:issue:`8420`).
Other enhancements:
diff --git a/doc/source/whatsnew/v0.16.1.rst b/doc/source/whatsnew/v0.16.1.rst
index 269854111373f..39767684c01d0 100644
--- a/doc/source/whatsnew/v0.16.1.rst
+++ b/doc/source/whatsnew/v0.16.1.rst
@@ -6,7 +6,7 @@ Version 0.16.1 (May 11, 2015)
{{ header }}
-This is a minor bug-fix release from 0.16.0 and includes a large number of
+This is a minor bug-fix release from 0.16.0 and includes a large number of
bug fixes along several new features, enhancements, and performance improvements.
We recommend that all users upgrade to this version.
@@ -72,7 +72,7 @@ setting the index of a ``DataFrame/Series`` with a ``category`` dtype would conv
Out[4]: Index(['c', 'a', 'b'], dtype='object')
-setting the index, will create a ``CategoricalIndex``
+setting the index will create a ``CategoricalIndex``
.. code-block:: ipython
diff --git a/doc/source/whatsnew/v0.16.2.rst b/doc/source/whatsnew/v0.16.2.rst
index 37e8c64ea9ced..194bb61f2c1c8 100644
--- a/doc/source/whatsnew/v0.16.2.rst
+++ b/doc/source/whatsnew/v0.16.2.rst
@@ -6,7 +6,7 @@ Version 0.16.2 (June 12, 2015)
{{ header }}
-This is a minor bug-fix release from 0.16.1 and includes a large number of
+This is a minor bug-fix release from 0.16.1 and includes a large number of
bug fixes along some new features (:meth:`~DataFrame.pipe` method), enhancements, and performance improvements.
We recommend that all users upgrade to this version.
diff --git a/doc/source/whatsnew/v0.18.0.rst b/doc/source/whatsnew/v0.18.0.rst
index 829c04dac9f2d..ef5242b0e33c8 100644
--- a/doc/source/whatsnew/v0.18.0.rst
+++ b/doc/source/whatsnew/v0.18.0.rst
@@ -53,7 +53,7 @@ New features
Window functions are now methods
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-Window functions have been refactored to be methods on ``Series/DataFrame`` objects, rather than top-level functions, which are now deprecated. This allows these window-type functions, to have a similar API to that of ``.groupby``. See the full documentation :ref:`here ` (:issue:`11603`, :issue:`12373`)
+Window functions have been refactored to be methods on ``Series/DataFrame`` objects, rather than top-level functions, which are now deprecated. This allows these window-type functions, to have a similar API to that of ``.groupby``. See the full documentation :ref:`here ` (:issue:`11603`, :issue:`12373`)
.. ipython:: python
@@ -610,7 +610,7 @@ Subtraction by ``Timedelta`` in a ``Series`` by a ``Timestamp`` works (:issue:`1
pd.Timestamp('2012-01-01') - ser
-``NaT.isoformat()`` now returns ``'NaT'``. This change allows
+``NaT.isoformat()`` now returns ``'NaT'``. This change allows
``pd.Timestamp`` to rehydrate any timestamp like object from its isoformat
(:issue:`12300`).
diff --git a/doc/source/whatsnew/v0.19.0.rst b/doc/source/whatsnew/v0.19.0.rst
index 340e1ce9ee1ef..2ac7b0f54361b 100644
--- a/doc/source/whatsnew/v0.19.0.rst
+++ b/doc/source/whatsnew/v0.19.0.rst
@@ -135,7 +135,7 @@ Method ``.rolling()`` is now time-series aware
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
``.rolling()`` objects are now time-series aware and can accept a time-series offset (or convertible) for the ``window`` argument (:issue:`13327`, :issue:`12995`).
-See the full documentation :ref:`here `.
+See the full documentation :ref:`here `.
.. ipython:: python
diff --git a/doc/source/whatsnew/v0.20.0.rst b/doc/source/whatsnew/v0.20.0.rst
index 2cb8e13e9a18a..a9e57f0039735 100644
--- a/doc/source/whatsnew/v0.20.0.rst
+++ b/doc/source/whatsnew/v0.20.0.rst
@@ -459,7 +459,7 @@ Selecting via a scalar value that is contained *in* the intervals.
Other enhancements
^^^^^^^^^^^^^^^^^^
-- ``DataFrame.rolling()`` now accepts the parameter ``closed='right'|'left'|'both'|'neither'`` to choose the rolling window-endpoint closedness. See the :ref:`documentation ` (:issue:`13965`)
+- ``DataFrame.rolling()`` now accepts the parameter ``closed='right'|'left'|'both'|'neither'`` to choose the rolling window-endpoint closedness. See the :ref:`documentation ` (:issue:`13965`)
- Integration with the ``feather-format``, including a new top-level ``pd.read_feather()`` and ``DataFrame.to_feather()`` method, see :ref:`here `.
- ``Series.str.replace()`` now accepts a callable, as replacement, which is passed to ``re.sub`` (:issue:`15055`)
- ``Series.str.replace()`` now accepts a compiled regular expression as a pattern (:issue:`15446`)
@@ -988,7 +988,7 @@ A binary window operation, like ``.corr()`` or ``.cov()``, when operating on a `
will now return a 2-level ``MultiIndexed DataFrame`` rather than a ``Panel``, as ``Panel`` is now deprecated,
see :ref:`here `. These are equivalent in function,
but a MultiIndexed ``DataFrame`` enjoys more support in pandas.
-See the section on :ref:`Windowed Binary Operations ` for more information. (:issue:`15677`)
+See the section on :ref:`Windowed Binary Operations ` for more information. (:issue:`15677`)
.. ipython:: python
@@ -1167,7 +1167,7 @@ Other API changes
- ``.loc`` has compat with ``.ix`` for accepting iterators, and NamedTuples (:issue:`15120`)
- ``interpolate()`` and ``fillna()`` will raise a ``ValueError`` if the ``limit`` keyword argument is not greater than 0. (:issue:`9217`)
- ``pd.read_csv()`` will now issue a ``ParserWarning`` whenever there are conflicting values provided by the ``dialect`` parameter and the user (:issue:`14898`)
-- ``pd.read_csv()`` will now raise a ``ValueError`` for the C engine if the quote character is larger than one byte (:issue:`11592`)
+- ``pd.read_csv()`` will now raise a ``ValueError`` for the C engine if the quote character is larger than one byte (:issue:`11592`)
- ``inplace`` arguments now require a boolean value, else a ``ValueError`` is thrown (:issue:`14189`)
- ``pandas.api.types.is_datetime64_ns_dtype`` will now report ``True`` on a tz-aware dtype, similar to ``pandas.api.types.is_datetime64_any_dtype``
- ``DataFrame.asof()`` will return a null filled ``Series`` instead the scalar ``NaN`` if a match is not found (:issue:`15118`)
@@ -1315,7 +1315,7 @@ The recommended methods of indexing are:
- ``.loc`` if you want to *label* index
- ``.iloc`` if you want to *positionally* index.
-Using ``.ix`` will now show a ``DeprecationWarning`` with a link to some examples of how to convert code `here `__.
+Using ``.ix`` will now show a ``DeprecationWarning`` with a link to some examples of how to convert code :ref:`here `.
.. ipython:: python
@@ -1663,11 +1663,11 @@ Indexing
- Bug in ``.reset_index()`` when an all ``NaN`` level of a ``MultiIndex`` would fail (:issue:`6322`)
- Bug in ``.reset_index()`` when raising error for index name already present in ``MultiIndex`` columns (:issue:`16120`)
- Bug in creating a ``MultiIndex`` with tuples and not passing a list of names; this will now raise ``ValueError`` (:issue:`15110`)
-- Bug in the HTML display with a ``MultiIndex`` and truncation (:issue:`14882`)
+- Bug in the HTML display with a ``MultiIndex`` and truncation (:issue:`14882`)
- Bug in the display of ``.info()`` where a qualifier (+) would always be displayed with a ``MultiIndex`` that contains only non-strings (:issue:`15245`)
- Bug in ``pd.concat()`` where the names of ``MultiIndex`` of resulting ``DataFrame`` are not handled correctly when ``None`` is presented in the names of ``MultiIndex`` of input ``DataFrame`` (:issue:`15787`)
- Bug in ``DataFrame.sort_index()`` and ``Series.sort_index()`` where ``na_position`` doesn't work with a ``MultiIndex`` (:issue:`14784`, :issue:`16604`)
-- Bug in ``pd.concat()`` when combining objects with a ``CategoricalIndex`` (:issue:`16111`)
+- Bug in ``pd.concat()`` when combining objects with a ``CategoricalIndex`` (:issue:`16111`)
- Bug in indexing with a scalar and a ``CategoricalIndex`` (:issue:`16123`)
IO
diff --git a/doc/source/whatsnew/v0.21.0.rst b/doc/source/whatsnew/v0.21.0.rst
index 1bbbbdc7e5410..6035b89aa8643 100644
--- a/doc/source/whatsnew/v0.21.0.rst
+++ b/doc/source/whatsnew/v0.21.0.rst
@@ -50,7 +50,7 @@ Parquet is designed to faithfully serialize and de-serialize ``DataFrame`` s, su
dtypes, including extension dtypes such as datetime with timezones.
This functionality depends on either the `pyarrow `__ or `fastparquet `__ library.
-For more details, see :ref:`the IO docs on Parquet `.
+For more details, see :ref:`the IO docs on Parquet `.
.. _whatsnew_0210.enhancements.infer_objects:
diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
index ce784231a47d2..9ef50045d5b5e 100644
--- a/doc/source/whatsnew/v0.24.0.rst
+++ b/doc/source/whatsnew/v0.24.0.rst
@@ -1622,7 +1622,7 @@ Timedelta
- Bug in :class:`DataFrame` with ``timedelta64[ns]`` dtype division by ``Timedelta``-like scalar incorrectly returning ``timedelta64[ns]`` dtype instead of ``float64`` dtype (:issue:`20088`, :issue:`22163`)
- Bug in adding a :class:`Index` with object dtype to a :class:`Series` with ``timedelta64[ns]`` dtype incorrectly raising (:issue:`22390`)
- Bug in multiplying a :class:`Series` with numeric dtype against a ``timedelta`` object (:issue:`22390`)
-- Bug in :class:`Series` with numeric dtype when adding or subtracting an array or ``Series`` with ``timedelta64`` dtype (:issue:`22390`)
+- Bug in :class:`Series` with numeric dtype when adding or subtracting an array or ``Series`` with ``timedelta64`` dtype (:issue:`22390`)
- Bug in :class:`Index` with numeric dtype when multiplying or dividing an array with dtype ``timedelta64`` (:issue:`22390`)
- Bug in :class:`TimedeltaIndex` incorrectly allowing indexing with ``Timestamp`` object (:issue:`20464`)
- Fixed bug where subtracting :class:`Timedelta` from an object-dtyped array would raise ``TypeError`` (:issue:`21980`)
@@ -1868,7 +1868,7 @@ Reshaping
- :func:`pandas.core.groupby.GroupBy.rank` now raises a ``ValueError`` when an invalid value is passed for argument ``na_option`` (:issue:`22124`)
- Bug in :func:`get_dummies` with Unicode attributes in Python 2 (:issue:`22084`)
- Bug in :meth:`DataFrame.replace` raises ``RecursionError`` when replacing empty lists (:issue:`22083`)
-- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` when dict is used as the ``to_replace`` value and one key in the dict is another key's value, the results were inconsistent between using integer key and using string key (:issue:`20656`)
+- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` when dict is used as the ``to_replace`` value and one key in the dict is another key's value, the results were inconsistent between using integer key and using string key (:issue:`20656`)
- Bug in :meth:`DataFrame.drop_duplicates` for empty ``DataFrame`` which incorrectly raises an error (:issue:`20516`)
- Bug in :func:`pandas.wide_to_long` when a string is passed to the stubnames argument and a column name is a substring of that stubname (:issue:`22468`)
- Bug in :func:`merge` when merging ``datetime64[ns, tz]`` data that contained a DST transition (:issue:`18885`)
diff --git a/doc/source/whatsnew/v0.6.0.rst b/doc/source/whatsnew/v0.6.0.rst
index 253ca4d4188e5..8ff688eaa91e7 100644
--- a/doc/source/whatsnew/v0.6.0.rst
+++ b/doc/source/whatsnew/v0.6.0.rst
@@ -15,7 +15,7 @@ New features
~~~~~~~~~~~~
- :ref:`Added ` ``melt`` function to ``pandas.core.reshape``
- :ref:`Added ` ``level`` parameter to group by level in Series and DataFrame descriptive statistics (:issue:`313`)
-- :ref:`Added ` ``head`` and ``tail`` methods to Series, analogous to DataFrame (:issue:`296`)
+- :ref:`Added ` ``head`` and ``tail`` methods to Series, analogous to DataFrame (:issue:`296`)
- :ref:`Added ` ``Series.isin`` function which checks if each value is contained in a passed sequence (:issue:`289`)
- :ref:`Added ` ``float_format`` option to ``Series.to_string``
- :ref:`Added ` ``skip_footer`` (:issue:`291`) and ``converters`` (:issue:`343`) options to ``read_csv`` and ``read_table``
diff --git a/doc/source/whatsnew/v0.6.1.rst b/doc/source/whatsnew/v0.6.1.rst
index 139c6e2d1cb0c..8ee80fa2c44b1 100644
--- a/doc/source/whatsnew/v0.6.1.rst
+++ b/doc/source/whatsnew/v0.6.1.rst
@@ -25,12 +25,12 @@ New features
constructor (:issue:`444`)
- DataFrame.convert_objects method for :ref:`inferring better dtypes `
for object columns (:issue:`302`)
-- Add :ref:`rolling_corr_pairwise ` function for
+- Add :ref:`rolling_corr_pairwise ` function for
computing Panel of correlation matrices (:issue:`189`)
- Add :ref:`margins ` option to :ref:`pivot_table
` for computing subgroup aggregates (:issue:`114`)
- Add ``Series.from_csv`` function (:issue:`482`)
-- :ref:`Can pass ` DataFrame/DataFrame and
+- :ref:`Can pass ` DataFrame/DataFrame and
DataFrame/Series to rolling_corr/rolling_cov (GH #462)
- MultiIndex.get_level_values can :ref:`accept the level name `
diff --git a/doc/source/whatsnew/v0.8.0.rst b/doc/source/whatsnew/v0.8.0.rst
index 781054fc4de7c..b34c2a5c6a07c 100644
--- a/doc/source/whatsnew/v0.8.0.rst
+++ b/doc/source/whatsnew/v0.8.0.rst
@@ -81,7 +81,7 @@ Time Series changes and improvements
timestamps are stored as UTC; Timestamps from DatetimeIndex objects with time
zone set will be localized to local time. Time zone conversions are therefore
essentially free. User needs to know very little about pytz library now; only
- time zone names as strings are required. Time zone-aware timestamps are
+ time zone names as strings are required. Time zone-aware timestamps are
equal if and only if their UTC timestamps match. Operations between time
zone-aware time series with different time zones will result in a UTC-indexed
time series.
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index 6512e4cce02a9..8f9ceb30a947a 100755
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -46,7 +46,7 @@ We've added an ``engine`` keyword to :meth:`~core.window.rolling.Rolling.apply`
that allows the user to execute the routine using `Numba `__ instead of Cython.
Using the Numba engine can yield significant performance gains if the apply function can operate on numpy arrays and
the data set is larger (1 million rows or greater). For more details, see
-:ref:`rolling apply documentation ` (:issue:`28987`, :issue:`30936`)
+:ref:`rolling apply documentation ` (:issue:`28987`, :issue:`30936`)
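As an illustrative sketch of the ``engine`` keyword described above (not the documented example; it assumes Numba is installed and uses ``raw=True`` so the applied function receives NumPy arrays):

.. code-block:: python

    import numpy as np
    import pandas as pd

    ser = pd.Series(np.arange(1_000, dtype="float64"))

    def window_mean(values):
        # ``values`` is a 1-D NumPy array because ``raw=True``
        return values.mean()

    # executed through the Numba engine instead of Cython
    result = ser.rolling(10).apply(window_mean, engine="numba", raw=True)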
.. _whatsnew_100.custom_window:
@@ -57,7 +57,7 @@ We've added a :func:`pandas.api.indexers.BaseIndexer` class that allows users to
window bounds are created during ``rolling`` operations. Users can define their own ``get_window_bounds``
method on a :func:`pandas.api.indexers.BaseIndexer` subclass that will generate the start and end
indices used for each window during the rolling aggregation. For more details and example usage, see
-the :ref:`custom window rolling documentation `
+the :ref:`custom window rolling documentation `
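For example, a minimal sketch of a custom indexer; ``ForwardLookingIndexer`` is a hypothetical name used only for illustration (pandas ships a similar ``FixedForwardWindowIndexer``):

.. code-block:: python

    import numpy as np
    import pandas as pd
    from pandas.api.indexers import BaseIndexer

    class ForwardLookingIndexer(BaseIndexer):
        # each window spans the current row and the next ``window_size - 1`` rows
        def get_window_bounds(self, num_values=0, min_periods=None, center=None, closed=None, step=None):
            start = np.arange(num_values, dtype=np.int64)
            end = np.minimum(start + self.window_size, num_values)
            return start, end

    ser = pd.Series(range(5), dtype="float64")
    ser.rolling(ForwardLookingIndexer(window_size=2)).sum()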
.. _whatsnew_100.to_markdown:
diff --git a/doc/source/whatsnew/v1.1.5.rst b/doc/source/whatsnew/v1.1.5.rst
index 46c4ad4f35fe4..a29ae1912e338 100644
--- a/doc/source/whatsnew/v1.1.5.rst
+++ b/doc/source/whatsnew/v1.1.5.rst
@@ -14,15 +14,8 @@ including other versions of pandas.
Fixed regressions
~~~~~~~~~~~~~~~~~
-- Fixed regression in addition of a timedelta-like scalar to a :class:`DatetimeIndex` raising incorrectly (:issue:`37295`)
-- Fixed regression in :meth:`Series.groupby` raising when the :class:`Index` of the :class:`Series` had a tuple as its name (:issue:`37755`)
-- Fixed regression in :meth:`DataFrame.loc` and :meth:`Series.loc` for ``__setitem__`` when one-dimensional tuple was given to select from :class:`MultiIndex` (:issue:`37711`)
-- Fixed regression in inplace operations on :class:`Series` with ``ExtensionDtype`` with NumPy dtyped operand (:issue:`37910`)
-- Fixed regression in metadata propagation for ``groupby`` iterator (:issue:`37343`)
-- Fixed regression in indexing on a :class:`Series` with ``CategoricalDtype`` after unpickling (:issue:`37631`)
-- Fixed regression in :meth:`DataFrame.groupby` aggregation with out-of-bounds datetime objects in an object-dtype column (:issue:`36003`)
-- Fixed regression in ``df.groupby(..).rolling(..)`` with the resulting :class:`MultiIndex` when grouping by a label that is in the index (:issue:`37641`)
-- Fixed regression in :meth:`DataFrame.fillna` not filling ``NaN`` after other operations such as :meth:`DataFrame.pivot` (:issue:`36495`).
+-
+-
.. ---------------------------------------------------------------------------
@@ -30,15 +23,10 @@ Fixed regressions
Bug fixes
~~~~~~~~~
-- Bug in pytables methods in python 3.9 (:issue:`38041`)
-
-.. ---------------------------------------------------------------------------
-
-.. _whatsnew_115.other:
-
-Other
-~~~~~
-- Only set ``-Werror`` as a compiler flag in the CI jobs (:issue:`33315`, :issue:`33314`)
+- Bug in metadata propagation for ``groupby`` iterator (:issue:`37343`)
+- Bug in indexing on a :class:`Series` with ``CategoricalDtype`` after unpickling (:issue:`37631`)
+- Bug in :class:`RollingGroupby` with the resulting :class:`MultiIndex` when grouping by a label that is in the index (:issue:`37641`)
+-
.. ---------------------------------------------------------------------------
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index ac8132339d38c..d1899e1d72509 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -22,7 +22,7 @@ Optionally disallow duplicate labels
control whether the index or columns can contain duplicate labels (:issue:`28394`). This can be used to
prevent accidental introduction of duplicate labels, which can affect downstream operations.
-By default, duplicates continue to be allowed.
+By default, duplicates continue to be allowed.
.. ipython:: python
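A minimal editorial sketch of the flag described above (assuming a pandas version where :meth:`~DataFrame.set_flags` is available; this is not the elided documented example):

.. code-block:: python

    import pandas as pd

    df = pd.DataFrame({"A": [0, 1]}, index=["a", "b"]).set_flags(allows_duplicate_labels=False)
    df.flags.allows_duplicate_labels  # False

    # operations that would introduce duplicate labels, e.g. df.reindex(["a", "a"]),
    # now raise a DuplicateLabelError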
@@ -84,8 +84,7 @@ Support for binary file handles in ``to_csv``
:meth:`to_csv` supports file handles in binary mode (:issue:`19827` and :issue:`35058`)
with ``encoding`` (:issue:`13068` and :issue:`23854`) and ``compression`` (:issue:`22555`).
-If pandas does not automatically detect whether the file handle is opened in binary or text mode,
-it is necessary to provide ``mode="wb"``.
+``mode`` has to contain a ``b`` for binary handles to be supported.
For example:
@@ -95,7 +94,7 @@ For example:
data = pd.DataFrame([0, 1, 2])
buffer = io.BytesIO()
- data.to_csv(buffer, encoding="utf-8", compression="gzip")
+ data.to_csv(buffer, mode="w+b", encoding="utf-8", compression="gzip")
Support for short caption and table position in ``to_latex``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -104,7 +103,7 @@ Support for short caption and table position in ``to_latex``
a floating table position (:issue:`35281`)
and a short caption (:issue:`36267`).
-The keyword ``position`` has been added to set the position.
+The new keyword ``position`` sets the table position.
.. ipython:: python
@@ -112,9 +111,9 @@ The keyword ``position`` has been added to set the position.
table = data.to_latex(position='ht')
print(table)
-Usage of the keyword ``caption`` has been extended.
+Usage of the keyword ``caption`` is extended.
Besides taking a single string as an argument,
-one can optionally provide a tuple ``(full_caption, short_caption)``
+one can optionally provide a tuple of ``(full_caption, short_caption)``
to add a short caption macro.
.. ipython:: python
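A minimal sketch combining both keywords (illustrative only; the caption strings are placeholders, not the elided documented example):

.. code-block:: python

    import pandas as pd

    data = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
    # position controls the LaTeX float placement, the tuple adds a short caption macro
    print(data.to_latex(position="ht", caption=("a rather long full caption", "short caption")))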
@@ -141,12 +140,12 @@ parser by default should have no impact on performance. (:issue:`17154`)
Experimental nullable data types for float data
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-We've added :class:`Float32Dtype` / :class:`Float64Dtype` and :class:`~arrays.FloatingArray`.
-These are extension data types dedicated to floating point data that can hold the
+We've added :class:`Float32Dtype` / :class:`Float64Dtype` and :class:`~arrays.FloatingArray`,
+an extension data type dedicated to floating point data that can hold the
``pd.NA`` missing value indicator (:issue:`32265`, :issue:`34307`).
While the default float data type already supports missing values using ``np.nan``,
-these new data types use ``pd.NA`` (and its corresponding behaviour) as the missing
+this new data type uses ``pd.NA`` (and its corresponding behaviour) as the missing
value indicator, in line with the already existing nullable :ref:`integer `
and :ref:`boolean ` data types.
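A minimal sketch of the new dtype (an editorial illustration, assuming a pandas version with the nullable float types):

.. code-block:: python

    import pandas as pd

    arr = pd.array([0.1, None, 0.3], dtype="Float64")
    arr                    # FloatingArray holding pd.NA for the missing entry
    pd.Series(arr).sum()   # missing values are skipped, as with the nullable integer dtype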
@@ -180,7 +179,7 @@ Alternatively, you can also use the dtype object:
.. warning::
- Experimental: the new floating data types are currently experimental, and their
+ Experimental: the new floating data types are currently experimental, and their
behaviour or API may still change without warning. Especially the behaviour
regarding NaN (distinct from NA missing values) is subject to change.
@@ -189,8 +188,8 @@ Alternatively, you can also use the dtype object:
Index/column name preservation when aggregating
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-When aggregating using :meth:`concat` or the :class:`DataFrame` constructor, pandas
-will now attempt to preserve index and column names whenever possible (:issue:`35847`).
+When aggregating using :meth:`concat` or the :class:`DataFrame` constructor, pandas
+will attempt to preserve index (and column) names whenever possible (:issue:`35847`).
In the case where all inputs share a common name, this name will be assigned to the
result. When the input names do not all agree, the result will be unnamed. Here is an
example where the index name is preserved:
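A minimal sketch of the behaviour (an editorial illustration, not the elided documented example; the names ``idx``, ``x`` and ``y`` are placeholders):

.. code-block:: python

    import pandas as pd

    idx = pd.Index(["a", "b"], name="idx")
    s1 = pd.Series([1, 2], index=idx, name="x")
    s2 = pd.Series([3, 4], index=idx, name="y")
    pd.concat([s1, s2], axis=1).index.name  # "idx" is preserved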
@@ -248,164 +247,37 @@ By default, backward resample uses ``closed=right`` while ``closed=left`` is als
ts.resample("17min", closed="left", origin="end").sum()
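A minimal sketch of a backward resample (an editorial illustration, assuming a pandas version in which ``origin="end"`` is available):

.. code-block:: python

    import pandas as pd

    ts = pd.Series(range(10), index=pd.date_range("2020-01-01", periods=10, freq="T"))
    # bins are anchored on the last timestamp and extend backwards
    ts.resample("3T", origin="end", closed="right").sum()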
-.. _whatsnew_120.groupby_ewm:
-
-Groupby supports EWM operations directly
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-:class:`.DataFrameGroupBy` now supports exponentially weighted window operations directly (:issue:`16037`).
-
-.. ipython:: python
-
- df = pd.DataFrame({'A': ['a', 'b', 'a', 'b'], 'B': range(4)})
- df
- df.groupby('A').ewm(com=1.0).mean()
-
-Additionally ``mean`` supports execution via `Numba `__ with
-the ``engine`` and ``engine_kwargs`` arguments. Numba must be installed as an optional dependency
-to use this feature.
-
.. _whatsnew_120.enhancements.other:
Other enhancements
^^^^^^^^^^^^^^^^^^
-- Added ``day_of_week`` (compatibility alias ``dayofweek``) property to :class:`Timestamp`, :class:`.DatetimeIndex`, :class:`Period`, :class:`PeriodIndex` (:issue:`9605`)
-- Added ``day_of_year`` (compatibility alias ``dayofyear``) property to :class:`Timestamp`, :class:`.DatetimeIndex`, :class:`Period`, :class:`PeriodIndex` (:issue:`9605`)
-- Added :meth:`~DataFrame.set_flags` for setting table-wide flags on a Series or DataFrame (:issue:`28394`)
+- Added ``day_of_week`` (compatibility alias ``dayofweek``) property to ``Timestamp``, ``DatetimeIndex``, ``Period``, ``PeriodIndex`` (:issue:`9605`)
+- Added ``day_of_year`` (compatibility alias ``dayofyear``) property to ``Timestamp``, ``DatetimeIndex``, ``Period``, ``PeriodIndex`` (:issue:`9605`)
+- Added :meth:`~DataFrame.set_flags` for setting table-wide flags on a ``Series`` or ``DataFrame`` (:issue:`28394`)
- :meth:`DataFrame.applymap` now supports ``na_action`` (:issue:`23803`)
- :class:`Index` with object dtype supports division and multiplication (:issue:`34160`)
- :meth:`DataFrame.explode` and :meth:`Series.explode` now support exploding of sets (:issue:`35614`)
- :meth:`DataFrame.hist` now supports time series (datetime) data (:issue:`32590`)
-- :meth:`.Styler.set_table_styles` now allows the direct styling of rows and columns and can be chained (:issue:`35607`)
-- :class:`.Styler` now allows direct CSS class name addition to individual data cells (:issue:`36159`)
-- :meth:`.Rolling.mean` and :meth:`.Rolling.sum` use Kahan summation to calculate the mean to avoid numerical problems (:issue:`10319`, :issue:`11645`, :issue:`13254`, :issue:`32761`, :issue:`36031`)
-- :meth:`.DatetimeIndex.searchsorted`, :meth:`.TimedeltaIndex.searchsorted`, :meth:`PeriodIndex.searchsorted`, and :meth:`Series.searchsorted` with datetimelike dtypes will now try to cast string arguments (listlike and scalar) to the matching datetimelike type (:issue:`36346`)
+- ``Styler`` now allows direct CSS class name addition to individual data cells (:issue:`36159`)
+- :meth:`Rolling.mean()` and :meth:`Rolling.sum()` use Kahan summation to calculate the mean to avoid numerical problems (:issue:`10319`, :issue:`11645`, :issue:`13254`, :issue:`32761`, :issue:`36031`)
+- :meth:`DatetimeIndex.searchsorted`, :meth:`TimedeltaIndex.searchsorted`, :meth:`PeriodIndex.searchsorted`, and :meth:`Series.searchsorted` with datetimelike dtypes will now try to cast string arguments (listlike and scalar) to the matching datetimelike type (:issue:`36346`)
+-
- Added methods :meth:`IntegerArray.prod`, :meth:`IntegerArray.min`, and :meth:`IntegerArray.max` (:issue:`33790`)
-- Calling a NumPy ufunc on a ``DataFrame`` with extension types now preserves the extension types when possible (:issue:`23743`).
-- Calling a binary-input NumPy ufunc on multiple ``DataFrame`` objects now aligns, matching the behavior of binary operations and ufuncs on ``Series`` (:issue:`23743`).
- Where possible :meth:`RangeIndex.difference` and :meth:`RangeIndex.symmetric_difference` will return :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`36564`)
-- :meth:`DataFrame.to_parquet` now supports :class:`MultiIndex` for columns in parquet format (:issue:`34777`)
-- Added :meth:`.Rolling.sem` and :meth:`Expanding.sem` to compute the standard error of the mean (:issue:`26476`)
-- :meth:`.Rolling.var` and :meth:`.Rolling.std` use Kahan summation and Welford's Method to avoid numerical issues (:issue:`37051`)
-- :meth:`DataFrame.corr` and :meth:`DataFrame.cov` use Welford's Method to avoid numerical issues (:issue:`37448`)
+- Added :meth:`Rolling.sem()` and :meth:`Expanding.sem()` to compute the standard error of the mean (:issue:`26476`).
+- :meth:`Rolling.var()` and :meth:`Rolling.std()` use Kahan summation and Welford's Method to avoid numerical issues (:issue:`37051`)
+- :meth:`DataFrame.corr` and :meth:`DataFrame.cov` use Welford's Method to avoid numerical issues (:issue:`37448`)
- :meth:`DataFrame.plot` now recognizes ``xlabel`` and ``ylabel`` arguments for plots of type ``scatter`` and ``hexbin`` (:issue:`37001`)
-- :class:`DataFrame` now supports the ``divmod`` operation (:issue:`37165`)
+- :class:`DataFrame` now supports the ``divmod`` operation (:issue:`37165`)
- :meth:`DataFrame.to_parquet` now returns a ``bytes`` object when no ``path`` argument is passed (:issue:`37105`)
-- :class:`.Rolling` now supports the ``closed`` argument for fixed windows (:issue:`34315`)
-- :class:`.DatetimeIndex` and :class:`Series` with ``datetime64`` or ``datetime64tz`` dtypes now support ``std`` (:issue:`37436`)
+- :class:`Rolling` now supports the ``closed`` argument for fixed windows (:issue:`34315`)
+- :class:`DatetimeIndex` and :class:`Series` with ``datetime64`` or ``datetime64tz`` dtypes now support ``std`` (:issue:`37436`)
- :class:`Window` now supports all Scipy window types in ``win_type`` with flexible keyword argument support (:issue:`34556`)
- :meth:`testing.assert_index_equal` now has a ``check_order`` parameter that allows indexes to be checked in an order-insensitive manner (:issue:`37478`)
- :func:`read_csv` supports memory-mapping for compressed files (:issue:`37621`)
-- Add support for ``min_count`` keyword for :meth:`DataFrame.groupby` and :meth:`DataFrame.resample` for functions ``min``, ``max``, ``first`` and ``last`` (:issue:`37821`, :issue:`37768`)
-- Improve error reporting for :meth:`DataFrame.merge` when invalid merge column definitions were given (:issue:`16228`)
-- Improve numerical stability for :meth:`.Rolling.skew`, :meth:`.Rolling.kurt`, :meth:`Expanding.skew` and :meth:`Expanding.kurt` through implementation of Kahan summation (:issue:`6929`)
-- Improved error reporting for subsetting columns of a :class:`.DataFrameGroupBy` with ``axis=1`` (:issue:`37725`)
-- Implement method ``cross`` for :meth:`DataFrame.merge` and :meth:`DataFrame.join` (:issue:`5401`)
-
-.. ---------------------------------------------------------------------------
-
-.. _whatsnew_120.notable_bug_fixes:
-
-Notable bug fixes
-~~~~~~~~~~~~~~~~~
-
-These are bug fixes that might have notable behavior changes.
-
-Consistency of DataFrame Reductions
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-:meth:`DataFrame.any` and :meth:`DataFrame.all` with ``bool_only=True`` now
-determines whether to exclude object-dtype columns on a column-by-column basis,
-instead of checking if *all* object-dtype columns can be considered boolean.
-
-This prevents pathological behavior where applying the reduction on a subset
-of columns could result in a larger Series result. See (:issue:`37799`).
-
-.. ipython:: python
-
- df = pd.DataFrame({"A": ["foo", "bar"], "B": [True, False]}, dtype=object)
- df["C"] = pd.Series([True, True])
-
-
-*Previous behavior*:
-
-.. code-block:: ipython
-
- In [5]: df.all(bool_only=True)
- Out[5]:
- C True
- dtype: bool
-
- In [6]: df[["B", "C"]].all(bool_only=True)
- Out[6]:
- B False
- C True
- dtype: bool
-
-*New behavior*:
-
-.. ipython:: python
-
- In [5]: df.all(bool_only=True)
-
- In [6]: df[["B", "C"]].all(bool_only=True)
-
-
-Other DataFrame reductions with ``numeric_only=None`` will also avoid
-this pathological behavior (:issue:`37827`):
-
-.. ipython:: python
-
- df = pd.DataFrame({"A": [0, 1, 2], "B": ["a", "b", "c"]}, dtype=object)
-
-
-*Previous behavior*:
-
-.. code-block:: ipython
-
- In [3]: df.mean()
- Out[3]: Series([], dtype: float64)
-
- In [4]: df[["A"]].mean()
- Out[4]:
- A 1.0
- dtype: float64
-
-*New behavior*:
-
-.. ipython:: python
-
- df.mean()
-
- df[["A"]].mean()
-
-Moreover, DataFrame reductions with ``numeric_only=None`` will now be
-consistent with their Series counterparts. In particular, for
-reductions where the Series method raises ``TypeError``, the
-DataFrame reduction will now consider that column non-numeric
-instead of casting to a NumPy array which may have different semantics (:issue:`36076`,
-:issue:`28949`, :issue:`21020`).
-
-.. ipython:: python
-
- ser = pd.Series([0, 1], dtype="category", name="A")
- df = ser.to_frame()
-
-
-*Previous behavior*:
-
-.. code-block:: ipython
-
- In [5]: df.any()
- Out[5]:
- A True
- dtype: bool
-
-*New behavior*:
-
-.. ipython:: python
-
- df.any()
-
+- Improve error reporting for :meth:`DataFrame.merge()` when invalid merge column definitions were given (:issue:`16228`)
+- Improve numerical stability for :meth:`Rolling.skew()`, :meth:`Rolling.kurt()`, :meth:`Expanding.skew()` and :meth:`Expanding.kurt()` through implementation of Kahan summation (:issue:`6929`)
+- Improved error reporting for subsetting columns of a :class:`DataFrameGroupBy` with ``axis=1`` (:issue:`37725`)
.. _whatsnew_120.api_breaking.python:
@@ -493,11 +365,7 @@ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for mor
Other API changes
^^^^^^^^^^^^^^^^^
-- Sorting in descending order is now stable for :meth:`Series.sort_values` and :meth:`Index.sort_values` for DateTime-like :class:`Index` subclasses. This will affect sort order when sorting a DataFrame on multiple columns, sorting with a key function that produces duplicates, or requesting the sorting index when using :meth:`Index.sort_values`. When using :meth:`Series.value_counts`, the count of missing values is no longer necessarily last in the list of duplicate counts. Instead, its position corresponds to the position in the original Series. When using :meth:`Index.sort_values` for DateTime-like :class:`Index` subclasses, NaTs ignored the ``na_position`` argument and were sorted to the beginning. Now they respect ``na_position``, the default being ``last``, same as other :class:`Index` subclasses. (:issue:`35992`)
-- Passing an invalid ``fill_value`` to :meth:`Categorical.take`, :meth:`.DatetimeArray.take`, :meth:`TimedeltaArray.take`, or :meth:`PeriodArray.take` now raises a ``TypeError`` instead of a ``ValueError`` (:issue:`37733`)
-- Passing an invalid ``fill_value`` to :meth:`Series.shift` with a ``CategoricalDtype`` now raises a ``TypeError`` instead of a ``ValueError`` (:issue:`37733`)
-- Passing an invalid value to :meth:`IntervalIndex.insert` or :meth:`CategoricalIndex.insert` now raises a ``TypeError`` instead of a ``ValueError`` (:issue:`37733`)
-- Attempting to reindex a Series with a :class:`CategoricalIndex` with an invalid ``fill_value`` now raises a ``TypeError`` instead of a ``ValueError`` (:issue:`37733`)
+- Sorting in descending order is now stable for :meth:`Series.sort_values` and :meth:`Index.sort_values` for DateTime-like :class:`Index` subclasses. This will affect sort order when sorting a :class:`DataFrame` on multiple columns, sorting with a key function that produces duplicates, or requesting the sorting index when using :meth:`Index.sort_values`. When using :meth:`Series.value_counts`, the count of missing values is no longer necessarily last in the list of duplicate counts; instead, its position corresponds to the position in the original :class:`Series`. When using :meth:`Index.sort_values` for DateTime-like :class:`Index` subclasses, NaTs ignored the ``na_position`` argument and were sorted to the beginning. Now they respect ``na_position``, the default being ``last``, same as other :class:`Index` subclasses. (:issue:`35992`)
.. ---------------------------------------------------------------------------
@@ -506,31 +374,24 @@ Other API changes
Deprecations
~~~~~~~~~~~~
- Deprecated parameter ``inplace`` in :meth:`MultiIndex.set_codes` and :meth:`MultiIndex.set_levels` (:issue:`35626`)
-- Deprecated parameter ``dtype`` of method :meth:`~Index.copy` for all :class:`Index` subclasses. Use the :meth:`~Index.astype` method instead for changing dtype (:issue:`35853`)
-- Deprecated parameters ``levels`` and ``codes`` in :meth:`MultiIndex.copy`. Use the :meth:`~MultiIndex.set_levels` and :meth:`~MultiIndex.set_codes` methods instead (:issue:`36685`)
+- Deprecated parameter ``dtype`` in :meth:`~Index.copy` for all index classes. Use the :meth:`~Index.astype` method instead for changing dtype (:issue:`35853`)
+- Deprecated parameters ``levels`` and ``codes`` in :meth:`~MultiIndex.copy`. Use the :meth:`~MultiIndex.set_levels` and :meth:`~MultiIndex.set_codes` methods instead (:issue:`36685`)
- Date parser functions :func:`~pandas.io.date_converters.parse_date_time`, :func:`~pandas.io.date_converters.parse_date_fields`, :func:`~pandas.io.date_converters.parse_all_fields` and :func:`~pandas.io.date_converters.generic_parser` from ``pandas.io.date_converters`` are deprecated and will be removed in a future version; use :func:`to_datetime` instead (:issue:`35741`)
- :meth:`DataFrame.lookup` is deprecated and will be removed in a future version, use :meth:`DataFrame.melt` and :meth:`DataFrame.loc` instead (:issue:`18682`)
-- The method :meth:`Index.to_native_types` is deprecated. Use ``.astype(str)`` instead (:issue:`28867`)
-- Deprecated indexing :class:`DataFrame` rows with a single datetime-like string as ``df[string]``
- (given the ambiguity whether it is indexing the rows or selecting a column), use
- ``df.loc[string]`` instead (:issue:`36179`)
-- Deprecated casting an object-dtype index of ``datetime`` objects to :class:`.DatetimeIndex` in the :class:`Series` constructor (:issue:`23598`)
+- The method :meth:`Index.to_native_types` is deprecated. Use ``.astype(str)`` instead (:issue:`28867`)
+- Deprecated indexing :class:`DataFrame` rows with datetime-like strings ``df[string]``, use ``df.loc[string]`` instead (:issue:`36179`)
+- Deprecated casting an object-dtype index of ``datetime`` objects to :class:`DatetimeIndex` in the :class:`Series` constructor (:issue:`23598`)
- Deprecated :meth:`Index.is_all_dates` (:issue:`27744`)
- The default value of ``regex`` for :meth:`Series.str.replace` will change from ``True`` to ``False`` in a future release. In addition, single character regular expressions will *not* be treated as literal strings when ``regex=True`` is set. (:issue:`24804`)
- Deprecated automatic alignment on comparison operations between :class:`DataFrame` and :class:`Series`, do ``frame, ser = frame.align(ser, axis=1, copy=False)`` before e.g. ``frame == ser`` (:issue:`28759`)
- :meth:`Rolling.count` with ``min_periods=None`` will default to the size of the window in a future version (:issue:`31302`)
-- Using "outer" ufuncs on DataFrames to return 4d ndarray is now deprecated. Convert to an ndarray first (:issue:`23743`)
- Deprecated slice-indexing on timezone-aware :class:`DatetimeIndex` with naive ``datetime`` objects, to match scalar indexing behavior (:issue:`36148`)
- :meth:`Index.ravel` returning a ``np.ndarray`` is deprecated, in the future this will return a view on the same index (:issue:`19956`)
- Deprecate use of strings denoting units with 'M', 'Y' or 'y' in :func:`~pandas.to_timedelta` (:issue:`36666`)
- :class:`Index` methods ``&``, ``|``, and ``^`` behaving as the set operations :meth:`Index.intersection`, :meth:`Index.union`, and :meth:`Index.symmetric_difference`, respectively, are deprecated and in the future will behave as pointwise boolean operations matching :class:`Series` behavior. Use the named set methods instead (:issue:`36758`)
- :meth:`Categorical.is_dtype_equal` and :meth:`CategoricalIndex.is_dtype_equal` are deprecated, will be removed in a future version (:issue:`37545`)
- :meth:`Series.slice_shift` and :meth:`DataFrame.slice_shift` are deprecated, use :meth:`Series.shift` or :meth:`DataFrame.shift` instead (:issue:`37601`)
-- Partial slicing on unordered :class:`.DatetimeIndex` objects with keys that are not in the index is deprecated and will be removed in a future version (:issue:`18531`)
-- The ``how`` keyword in :meth:`PeriodIndex.astype` is deprecated and will be removed in a future version, use ``index.to_timestamp(how=how)`` instead (:issue:`37982`)
-- Deprecated :meth:`Index.asi8` for :class:`Index` subclasses other than :class:`.DatetimeIndex`, :class:`.TimedeltaIndex`, and :class:`PeriodIndex` (:issue:`37877`)
-- The ``inplace`` parameter of :meth:`Categorical.remove_unused_categories` is deprecated and will be removed in a future version (:issue:`37643`)
-- The ``null_counts`` parameter of :meth:`DataFrame.info` is deprecated and replaced by ``show_counts``. It will be removed in a future version (:issue:`37999`)
+
.. ---------------------------------------------------------------------------
@@ -541,22 +402,19 @@ Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~
- Performance improvements when creating DataFrame or Series with dtype ``str`` or :class:`StringDtype` from array with many string elements (:issue:`36304`, :issue:`36317`, :issue:`36325`, :issue:`36432`, :issue:`37371`)
-- Performance improvement in :meth:`.GroupBy.agg` with the ``numba`` engine (:issue:`35759`)
-- Performance improvements when creating :meth:`Series.map` from a huge dictionary (:issue:`34717`)
-- Performance improvement in :meth:`.GroupBy.transform` with the ``numba`` engine (:issue:`36240`)
-- :class:`.Styler` uuid method altered to compress data transmission over web whilst maintaining reasonably low table collision probability (:issue:`36345`)
-- Performance improvement in :func:`to_datetime` with non-ns time unit for ``float`` ``dtype`` columns (:issue:`20445`)
-- Performance improvement in setting values on an :class:`IntervalArray` (:issue:`36310`)
+- Performance improvement in :meth:`GroupBy.agg` with the ``numba`` engine (:issue:`35759`)
+- Performance improvements when creating :meth:`pd.Series.map` from a huge dictionary (:issue:`34717`)
+- Performance improvement in :meth:`GroupBy.transform` with the ``numba`` engine (:issue:`36240`)
+- ``Styler`` uuid method altered to compress data transmission over web whilst maintaining reasonably low table collision probability (:issue:`36345`)
+- Performance improvement in :meth:`pd.to_datetime` with non-ns time unit for ``float`` ``dtype`` columns (:issue:`20445`)
+- Performance improvement in setting values on an :class:`IntervalArray` (:issue:`36310`)
- The internal index method :meth:`~Index._shallow_copy` now makes the new index and original index share cached attributes,
avoiding creating these again, if created on either. This can speed up operations that depend on creating copies of existing indexes (:issue:`36840`)
-- Performance improvement in :meth:`.RollingGroupby.count` (:issue:`35625`)
-- Small performance decrease to :meth:`.Rolling.min` and :meth:`.Rolling.max` for fixed windows (:issue:`36567`)
+- Performance improvement in :meth:`RollingGroupby.count` (:issue:`35625`)
+- Small performance decrease to :meth:`Rolling.min` and :meth:`Rolling.max` for fixed windows (:issue:`36567`)
- Reduced peak memory usage in :meth:`DataFrame.to_pickle` when using ``protocol=5`` in python 3.8+ (:issue:`34244`)
-- Faster ``dir`` calls when the object has many index labels, e.g. ``dir(ser)`` (:issue:`37450`)
+- Faster ``dir`` calls when the object has many index labels, e.g. ``dir(ser)`` (:issue:`37450`)
- Performance improvement in :class:`ExpandingGroupby` (:issue:`37064`)
-- Performance improvement in :meth:`Series.astype` and :meth:`DataFrame.astype` for :class:`Categorical` (:issue:`8628`)
-- Performance improvement in :meth:`DataFrame.groupby` for ``float`` ``dtype`` (:issue:`28303`), changes of the underlying hash-function can lead to changes in float based indexes sort ordering for ties (e.g. :meth:`Index.value_counts`)
-- Performance improvement in :meth:`pd.isin` for inputs with more than 1e6 elements (:issue:`36611`)
.. ---------------------------------------------------------------------------
@@ -567,41 +425,35 @@ Bug fixes
Categorical
^^^^^^^^^^^
-- :meth:`Categorical.fillna` will always return a copy, validate a passed fill value regardless of whether there are any NAs to fill, and disallow an ``NaT`` as a fill value for numeric categories (:issue:`36530`)
+- :meth:`Categorical.fillna` will always return a copy, will validate a passed fill value regardless of whether there are any NAs to fill, and will disallow a ``NaT`` as a fill value for numeric categories (:issue:`36530`)
- Bug in :meth:`Categorical.__setitem__` that incorrectly raised when trying to set a tuple value (:issue:`20439`)
- Bug in :meth:`CategoricalIndex.equals` incorrectly casting non-category entries to ``np.nan`` (:issue:`37667`)
-- Bug in :meth:`CategoricalIndex.where` incorrectly setting non-category entries to ``np.nan`` instead of raising ``TypeError`` (:issue:`37977`)
--
Datetimelike
^^^^^^^^^^^^
-- Bug in :attr:`.DatetimeArray.date` where a ``ValueError`` would be raised with a read-only backing array (:issue:`33530`)
+- Bug in :attr:`DatetimeArray.date` where a ``ValueError`` would be raised with a read-only backing array (:issue:`33530`)
- Bug in ``NaT`` comparisons failing to raise ``TypeError`` on invalid inequality comparisons (:issue:`35046`)
-- Bug in :class:`.DateOffset` where attributes reconstructed from pickle files differ from original objects when input values exceed normal ranges (e.g months=12) (:issue:`34511`)
-- Bug in :meth:`.DatetimeIndex.get_slice_bound` where ``datetime.date`` objects were not accepted or naive :class:`Timestamp` with a tz-aware :class:`.DatetimeIndex` (:issue:`35690`)
-- Bug in :meth:`.DatetimeIndex.slice_locs` where ``datetime.date`` objects were not accepted (:issue:`34077`)
-- Bug in :meth:`.DatetimeIndex.searchsorted`, :meth:`.TimedeltaIndex.searchsorted`, :meth:`PeriodIndex.searchsorted`, and :meth:`Series.searchsorted` with ``datetime64``, ``timedelta64`` or :class:`Period` dtype placement of ``NaT`` values being inconsistent with NumPy (:issue:`36176`, :issue:`36254`)
-- Inconsistency in :class:`.DatetimeArray`, :class:`.TimedeltaArray`, and :class:`.PeriodArray` method ``__setitem__`` casting arrays of strings to datetimelike scalars but not scalar strings (:issue:`36261`)
-- Bug in :meth:`.DatetimeArray.take` incorrectly allowing ``fill_value`` with a mismatched timezone (:issue:`37356`)
-- Bug in :class:`.DatetimeIndex.shift` incorrectly raising when shifting empty indexes (:issue:`14811`)
-- :class:`Timestamp` and :class:`.DatetimeIndex` comparisons between timezone-aware and timezone-naive objects now follow the standard library ``datetime`` behavior, returning ``True``/``False`` for ``!=``/``==`` and raising for inequality comparisons (:issue:`28507`)
-- Bug in :meth:`.DatetimeIndex.equals` and :meth:`.TimedeltaIndex.equals` incorrectly considering ``int64`` indexes as equal (:issue:`36744`)
-- :meth:`Series.to_json`, :meth:`DataFrame.to_json`, and :meth:`read_json` now implement timezone parsing when orient structure is ``table`` (:issue:`35973`)
-- :meth:`astype` now attempts to convert to ``datetime64[ns, tz]`` directly from ``object`` with inferred timezone from string (:issue:`35973`)
-- Bug in :meth:`.TimedeltaIndex.sum` and :meth:`Series.sum` with ``timedelta64`` dtype on an empty index or series returning ``NaT`` instead of ``Timedelta(0)`` (:issue:`31751`)
-- Bug in :meth:`.DatetimeArray.shift` incorrectly allowing ``fill_value`` with a mismatched timezone (:issue:`37299`)
-- Bug in adding a :class:`.BusinessDay` with nonzero ``offset`` to a non-scalar other (:issue:`37457`)
+- Bug in :class:`DateOffset` where attributes reconstructed from pickle files differ from original objects when input values exceed normal ranges (e.g. ``months=12``) (:issue:`34511`)
+- Bug in :meth:`DatetimeIndex.get_slice_bound` where ``datetime.date`` objects were not accepted or naive :class:`Timestamp` with a tz-aware :class:`DatetimeIndex` (:issue:`35690`)
+- Bug in :meth:`DatetimeIndex.slice_locs` where ``datetime.date`` objects were not accepted (:issue:`34077`)
+- Bug in :meth:`DatetimeIndex.searchsorted`, :meth:`TimedeltaIndex.searchsorted`, :meth:`PeriodIndex.searchsorted`, and :meth:`Series.searchsorted` with ``datetime64``, ``timedelta64`` or ``Period`` dtype placement of ``NaT`` values being inconsistent with ``NumPy`` (:issue:`36176`, :issue:`36254`)
+- Inconsistency in :class:`DatetimeArray`, :class:`TimedeltaArray`, and :class:`PeriodArray` method ``__setitem__`` casting arrays of strings to datetimelike scalars but not scalar strings (:issue:`36261`)
+- Bug in :meth:`DatetimeArray.take` incorrectly allowing ``fill_value`` with a mismatched timezone (:issue:`37356`)
+- Bug in :class:`DatetimeIndex.shift` incorrectly raising when shifting empty indexes (:issue:`14811`)
+- :class:`Timestamp` and :class:`DatetimeIndex` comparisons between timezone-aware and timezone-naive objects now follow the standard library ``datetime`` behavior, returning ``True``/``False`` for ``!=``/``==`` and raising for inequality comparisons (:issue:`28507`)
+- Bug in :meth:`DatetimeIndex.equals` and :meth:`TimedeltaIndex.equals` incorrectly considering ``int64`` indexes as equal (:issue:`36744`)
+- :meth:`to_json` and :meth:`read_json` now implement timezone parsing when the orient structure is ``'table'`` (:issue:`35973`).
+- :meth:`astype` now attempts to convert to ``datetime64[ns, tz]`` directly from ``object`` with inferred timezone from string (:issue:`35973`).
+- Bug in :meth:`TimedeltaIndex.sum` and :meth:`Series.sum` with ``timedelta64`` dtype on an empty index or series returning ``NaT`` instead of ``Timedelta(0)`` (:issue:`31751`)
+- Bug in :meth:`DatetimeArray.shift` incorrectly allowing ``fill_value`` with a mismatched timezone (:issue:`37299`)
+- Bug in adding a :class:`BusinessDay` with nonzero ``offset`` to a non-scalar other (:issue:`37457`)
- Bug in :func:`to_datetime` with a read-only array incorrectly raising (:issue:`34857`)
-- Bug in :meth:`Series.isin` with ``datetime64[ns]`` dtype and :meth:`.DatetimeIndex.isin` incorrectly casting integers to datetimes (:issue:`36621`)
-- Bug in :meth:`Series.isin` with ``datetime64[ns]`` dtype and :meth:`.DatetimeIndex.isin` failing to consider timezone-aware and timezone-naive datetimes as always different (:issue:`35728`)
-- Bug in :meth:`Series.isin` with ``PeriodDtype`` dtype and :meth:`PeriodIndex.isin` failing to consider arguments with different ``PeriodDtype`` as always different (:issue:`37528`)
Timedelta
^^^^^^^^^
-- Bug in :class:`.TimedeltaIndex`, :class:`Series`, and :class:`DataFrame` floor-division with ``timedelta64`` dtypes and ``NaT`` in the denominator (:issue:`35529`)
-- Bug in parsing of ISO 8601 durations in :class:`Timedelta` and :func:`to_datetime` (:issue:`29773`, :issue:`36204`)
+- Bug in :class:`TimedeltaIndex`, :class:`Series`, and :class:`DataFrame` floor-division with ``timedelta64`` dtypes and ``NaT`` in the denominator (:issue:`35529`)
+- Bug in parsing of ISO 8601 durations in :class:`Timedelta` and :func:`to_datetime` (:issue:`37159`; fixes :issue:`29773` and :issue:`36204`)
- Bug in :func:`to_timedelta` with a read-only array incorrectly raising (:issue:`34857`)
-- Bug in :class:`Timedelta` incorrectly truncating to sub-second portion of a string input when it has precision higher than nanoseconds (:issue:`36738`)
Timezones
^^^^^^^^^
@@ -615,17 +467,17 @@ Numeric
- Bug in :func:`to_numeric` where float precision was incorrect (:issue:`31364`)
- Bug in :meth:`DataFrame.any` with ``axis=1`` and ``bool_only=True`` ignoring the ``bool_only`` keyword (:issue:`32432`)
- Bug in :meth:`Series.equals` where a ``ValueError`` was raised when numpy arrays were compared to scalars (:issue:`35267`)
-- Bug in :class:`Series` where two Series each have a :class:`.DatetimeIndex` with different timezones having those indexes incorrectly changed when performing arithmetic operations (:issue:`33671`)
-- Bug in :mod:`pandas.testing` module functions when used with ``check_exact=False`` on complex numeric types (:issue:`28235`)
+- Bug in :class:`Series` where two :class:`Series` each have a :class:`DatetimeIndex` with different timezones having those indexes incorrectly changed when performing arithmetic operations (:issue:`33671`)
+- Bug in :meth:`pd._testing.assert_almost_equal` giving incorrect results for complex numeric types (:issue:`28235`)
- Bug in :meth:`DataFrame.__rmatmul__` error handling reporting transposed shapes (:issue:`21581`)
- Bug in :class:`Series` flex arithmetic methods where the result when operating with a ``list``, ``tuple`` or ``np.ndarray`` would have an incorrect name (:issue:`36760`)
-- Bug in :class:`.IntegerArray` multiplication with ``timedelta`` and ``np.timedelta64`` objects (:issue:`36870`)
+- Bug in :class:`IntegerArray` multiplication with ``timedelta`` and ``np.timedelta64`` objects (:issue:`36870`)
- Bug in :class:`MultiIndex` comparison with tuple incorrectly treating tuple as array-like (:issue:`21517`)
- Bug in :meth:`DataFrame.diff` with ``datetime64`` dtypes including ``NaT`` values failing to fill ``NaT`` results correctly (:issue:`32441`)
- Bug in :class:`DataFrame` arithmetic ops incorrectly accepting keyword arguments (:issue:`36843`)
-- Bug in :class:`.IntervalArray` comparisons with :class:`Series` not returning Series (:issue:`36908`)
+- Bug in :class:`IntervalArray` comparisons with :class:`Series` not returning :class:`Series` (:issue:`36908`)
- Bug in :class:`DataFrame` allowing arithmetic operations with list of array-likes with undefined results. Behavior changed to raising ``ValueError`` (:issue:`36702`)
-- Bug in :meth:`DataFrame.std` with ``timedelta64`` dtype and ``skipna=False`` (:issue:`37392`)
+- Bug in :meth:`DataFrame.std` with ``timedelta64`` dtype and ``skipna=False`` (:issue:`37392`)
- Bug in :meth:`DataFrame.min` and :meth:`DataFrame.max` with ``datetime64`` dtype and ``skipna=False`` (:issue:`36907`)
Conversion
@@ -637,53 +489,36 @@ Conversion
Strings
^^^^^^^
- Bug in :meth:`Series.to_string`, :meth:`DataFrame.to_string`, and :meth:`DataFrame.to_latex` adding a leading space when ``index=False`` (:issue:`24980`)
-- Bug in :func:`to_numeric` raising a ``TypeError`` when attempting to convert a string dtype Series containing only numeric strings and ``NA`` (:issue:`37262`)
+- Bug in :func:`to_numeric` raising a ``TypeError`` when attempting to convert a string dtype :class:`Series` containing only numeric strings and ``NA`` (:issue:`37262`)
-
Interval
^^^^^^^^
-
-- Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` where :class:`Interval` dtypes would be converted to object dtypes (:issue:`34871`)
- Bug in :meth:`IntervalIndex.take` with negative indices and ``fill_value=None`` (:issue:`37330`)
-- Bug in :meth:`IntervalIndex.putmask` with datetime-like dtype incorrectly casting to object dtype (:issue:`37968`)
-- Bug in :meth:`IntervalArray.astype` incorrectly dropping dtype information with a :class:`CategoricalDtype` object (:issue:`37984`)
+-
-
Indexing
^^^^^^^^
-- Bug in :meth:`PeriodIndex.get_loc` incorrectly raising ``ValueError`` on non-datelike strings instead of ``KeyError``, causing similar errors in :meth:`Series.__getitem__`, :meth:`Series.__contains__`, and :meth:`Series.loc.__getitem__` (:issue:`34240`)
+- Bug in :meth:`PeriodIndex.get_loc` incorrectly raising ``ValueError`` on non-datelike strings instead of ``KeyError``, causing similar errors in :meth:`Series.__getitem__`, :meth:`Series.__contains__`, and :meth:`Series.loc.__getitem__` (:issue:`34240`)
- Bug in :meth:`Index.sort_values` where, when empty values were passed, the method would break by trying to compare missing values instead of pushing them to the end of the sort order. (:issue:`35584`)
-- Bug in :meth:`Index.get_indexer` and :meth:`Index.get_indexer_non_unique` where ``int64`` arrays are returned instead of ``intp``. (:issue:`36359`)
+- Bug in :meth:`Index.get_indexer` and :meth:`Index.get_indexer_non_unique` where ``int64`` arrays are returned instead of ``intp``. (:issue:`36359`)
- Bug in :meth:`DataFrame.sort_index` where parameter ascending passed as a list on a single level index gives wrong result. (:issue:`32334`)
- Bug in :meth:`DataFrame.reset_index` was incorrectly raising a ``ValueError`` for input with a :class:`MultiIndex` with missing values in a level with ``Categorical`` dtype (:issue:`24206`)
- Bug in indexing with boolean masks on datetime-like values sometimes returning a view instead of a copy (:issue:`36210`)
- Bug in :meth:`DataFrame.__getitem__` and :meth:`DataFrame.loc.__getitem__` with :class:`IntervalIndex` columns and a numeric indexer (:issue:`26490`)
- Bug in :meth:`Series.loc.__getitem__` with a non-unique :class:`MultiIndex` and an empty-list indexer (:issue:`13691`)
-- Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`MultiIndex` and a level named ``"0"`` (:issue:`37194`)
+- Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`MultiIndex` and a level named ``"0"`` (:issue:`37194`)
- Bug in :meth:`Series.__getitem__` when using an unsigned integer array as an indexer giving incorrect results or segfaulting instead of raising ``KeyError`` (:issue:`37218`)
- Bug in :meth:`Index.where` incorrectly casting numeric values to strings (:issue:`37591`)
-- Bug in :meth:`Series.loc` and :meth:`DataFrame.loc` raises when the index was of ``object`` dtype and the given numeric label was in the index (:issue:`26491`)
-- Bug in :meth:`DataFrame.loc` returned requested key plus missing values when ``loc`` was applied to single level from a :class:`MultiIndex` (:issue:`27104`)
-- Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`CategoricalIndex` using a listlike indexer containing NA values (:issue:`37722`)
-- Bug in :meth:`DataFrame.loc.__setitem__` expanding an empty :class:`DataFrame` with mixed dtypes (:issue:`37932`)
-- Bug in :meth:`DataFrame.xs` ignored ``droplevel=False`` for columns (:issue:`19056`)
-- Bug in :meth:`DataFrame.reindex` raising ``IndexingError`` wrongly for empty DataFrame with ``tolerance`` not None or ``method="nearest"`` (:issue:`27315`)
-- Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`CategoricalIndex` using listlike indexer that contains elements that are in the index's ``categories`` but not in the index itself failing to raise ``KeyError`` (:issue:`37901`)
-- Bug on inserting a boolean label into a :class:`DataFrame` with a numeric :class:`Index` columns incorrectly casting to integer (:issue:`36319`)
-- Bug in :meth:`DataFrame.iloc` and :meth:`Series.iloc` aligning objects in ``__setitem__`` (:issue:`22046`)
-- Bug in :meth:`MultiIndex.drop` does not raise if labels are partially found (:issue:`37820`)
-- Bug in :meth:`DataFrame.loc` did not raise ``KeyError`` when missing combination was given with ``slice(None)`` for remaining levels (:issue:`19556`)
-- Bug in :meth:`DataFrame.loc` raising ``TypeError`` when non-integer slice was given to select values from :class:`MultiIndex` (:issue:`25165`, :issue:`24263`)
-- Bug in :meth:`DataFrame.loc` returning and assigning elements in wrong order when indexer is differently ordered than the :class:`MultiIndex` to filter (:issue:`31330`, :issue:`34603`)
-- Bug in :meth:`DataFrame.loc` and :meth:`DataFrame.__getitem__` raising ``KeyError`` when columns were :class:`MultiIndex` with only one level (:issue:`29749`)
-- Bug in :meth:`Series.__getitem__` and :meth:`DataFrame.__getitem__` raising blank ``KeyError`` without missing keys for :class:`IntervalIndex` (:issue:`27365`)
+- Bug in :meth:`Series.loc` and :meth:`DataFrame.loc` raised when a numeric label was given for an object-dtype :class:`Index` even though the label was in the :class:`Index` (:issue:`26491`)
+- Bug in :meth:`DataFrame.loc` returned the requested key plus missing values when ``loc`` was applied to a single level from a :class:`MultiIndex` (:issue:`27104`)
Missing
^^^^^^^
-- Bug in :meth:`.SeriesGroupBy.transform` now correctly handles missing values for ``dropna=False`` (:issue:`35014`)
-- Bug in :meth:`Series.nunique` with ``dropna=True`` was returning incorrect results when both ``NA`` and ``None`` missing values were present (:issue:`37566`)
+- Bug in :meth:`SeriesGroupBy.transform` now correctly handles missing values for ``dropna=False`` (:issue:`35014`)
-
MultiIndex
@@ -692,103 +527,80 @@ MultiIndex
- Bug in :meth:`DataFrame.xs` when used with :class:`IndexSlice` raises ``TypeError`` with message ``"Expected label or tuple of labels"`` (:issue:`35301`)
- Bug in :meth:`DataFrame.reset_index` with ``NaT`` values in index raises ``ValueError`` with message ``"cannot convert float NaN to integer"`` (:issue:`36541`)
- Bug in :meth:`DataFrame.combine_first` when used with :class:`MultiIndex` containing string and ``NaN`` values raises ``TypeError`` (:issue:`36562`)
-- Bug in :meth:`MultiIndex.drop` dropped ``NaN`` values when non existing key was given as input (:issue:`18853`)
I/O
^^^
- :func:`read_sas` no longer leaks resources on failure (:issue:`35566`)
-- Bug in :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` caused a ``ValueError`` when it was called with a filename in combination with ``mode`` containing a ``b`` (:issue:`35058`)
-- Bug in :meth:`read_csv` with ``float_precision='round_trip'`` did not handle ``decimal`` and ``thousands`` parameters (:issue:`35365`)
+- Bug in :meth:`to_csv` caused a ``ValueError`` when it was called with a filename in combination with ``mode`` containing a ``b`` (:issue:`35058`)
+- In :meth:`read_csv` ``float_precision='round_trip'`` now handles ``decimal`` and ``thousands`` parameters (:issue:`35365`)
- :meth:`to_pickle` and :meth:`read_pickle` were closing user-provided file objects (:issue:`35679`)
- :meth:`to_csv` passes compression arguments for ``'gzip'`` always to ``gzip.GzipFile`` (:issue:`28103`)
- :meth:`to_csv` did not support zip compression for binary file object not having a filename (:issue:`35058`)
-- :meth:`to_csv` and :meth:`read_csv` did not honor ``compression`` and ``encoding`` for path-like objects that are internally converted to file-like objects (:issue:`35677`, :issue:`26124`, :issue:`32392`)
-- :meth:`DataFrame.to_pickle`, :meth:`Series.to_pickle`, and :meth:`read_pickle` did not support compression for file-objects (:issue:`26237`, :issue:`29054`, :issue:`29570`)
+- :meth:`to_csv` and :meth:`read_csv` did not honor ``compression`` and ``encoding`` for path-like objects that are internally converted to file-like objects (:issue:`35677`, :issue:`26124`, and :issue:`32392`)
+- :meth:`to_pickle` and :meth:`read_pickle` did not support compression for file-objects (:issue:`26237`, :issue:`29054`, and :issue:`29570`)
- Bug in :func:`LongTableBuilder.middle_separator` was duplicating LaTeX longtable entries in the List of Tables of a LaTeX document (:issue:`34360`)
- Bug in :meth:`read_csv` with ``engine='python'`` truncating data if multiple items present in first row and first element started with BOM (:issue:`36343`)
- Removed ``private_key`` and ``verbose`` from :func:`read_gbq` as they are no longer supported in ``pandas-gbq`` (:issue:`34654`, :issue:`30200`)
- Bumped minimum pytables version to 3.5.1 to avoid a ``ValueError`` in :meth:`read_hdf` (:issue:`24839`)
- Bug in :func:`read_table` and :func:`read_csv` when ``delim_whitespace=True`` and ``sep=default`` (:issue:`36583`)
-- Bug in :meth:`DataFrame.to_json` and :meth:`Series.to_json` when used with ``lines=True`` and ``orient='records'`` the last line of the record is not appended with 'new line character' (:issue:`36888`)
+- Bug in :meth:`to_json` with ``lines=True`` and ``orient='records'`` where the last line of the record was not appended with a new line character (:issue:`36888`)
- Bug in :meth:`read_parquet` with fixed offset timezones. String representation of timezones was not recognized (:issue:`35997`, :issue:`36004`)
- Bug in :meth:`DataFrame.to_html`, :meth:`DataFrame.to_string`, and :meth:`DataFrame.to_latex` ignoring the ``na_rep`` argument when ``float_format`` was also specified (:issue:`9046`, :issue:`13828`)
- Bug in output rendering of complex numbers showing too many trailing zeros (:issue:`36799`)
-- Bug in :class:`HDFStore` threw a ``TypeError`` when exporting an empty DataFrame with ``datetime64[ns, tz]`` dtypes with a fixed HDF5 store (:issue:`20594`)
-- Bug in :class:`HDFStore` was dropping timezone information when exporting a Series with ``datetime64[ns, tz]`` dtypes with a fixed HDF5 store (:issue:`20594`)
+- Bug in :class:`HDFStore` threw a ``TypeError`` when exporting an empty :class:`DataFrame` with ``datetime64[ns, tz]`` dtypes with a fixed HDF5 store (:issue:`20594`)
+- Bug in :class:`HDFStore` was dropping timezone information when exporting :class:`Series` with ``datetime64[ns, tz]`` dtypes with a fixed HDF5 store (:issue:`20594`)
- :func:`read_csv` was closing user-provided binary file handles when ``engine="c"`` and an ``encoding`` was requested (:issue:`36980`)
- Bug in :meth:`DataFrame.to_hdf` was not dropping missing rows with ``dropna=True`` (:issue:`35719`)
- Bug in :func:`read_html` was raising a ``TypeError`` when supplying a ``pathlib.Path`` argument to the ``io`` parameter (:issue:`37705`)
-- :meth:`DataFrame.to_excel`, :meth:`Series.to_excel`, :meth:`DataFrame.to_markdown`, and :meth:`Series.to_markdown` now support writing to fsspec URLs such as S3 and Google Cloud Storage (:issue:`33987`)
-- Bug in :func:`read_fwf` with ``skip_blank_lines=True`` was not skipping blank lines (:issue:`37758`)
-- Parse missing values using :func:`read_json` with ``dtype=False`` to ``NaN`` instead of ``None`` (:issue:`28501`)
-- :meth:`read_fwf` was inferring compression with ``compression=None`` which was not consistent with the other :meth:``read_*`` functions (:issue:`37909`)
-- :meth:`DataFrame.to_html` was ignoring ``formatters`` argument for ``ExtensionDtype`` columns (:issue:`36525`)
-- Bumped minimum xarray version to 0.12.3 to avoid reference to the removed ``Panel`` class (:issue:`27101`)
-
-Period
-^^^^^^
-
-- Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` where :class:`Period` dtypes would be converted to object dtypes (:issue:`34871`)
Plotting
^^^^^^^^
- Bug in :meth:`DataFrame.plot` was rotating xticklabels when ``subplots=True``, even if the x-axis wasn't an irregular time series (:issue:`29460`)
-- Bug in :meth:`DataFrame.plot` where a marker letter in the ``style`` keyword sometimes caused a ``ValueError`` (:issue:`21003`)
-- Bug in :meth:`DataFrame.plot.bar` and :meth:`Series.plot.bar` where ticks positions were assigned by value order instead of using the actual value for numeric or a smart ordering for string (:issue:`26186`, :issue:`11465`)
+- Bug in :meth:`DataFrame.plot` where a marker letter in the ``style`` keyword sometimes caused a ``ValueError`` (:issue:`21003`)
- Twinned axes were losing their tick labels which should only happen to all but the last row or column of 'externally' shared axes (:issue:`33819`)
-- Bug in :meth:`Series.plot` and :meth:`DataFrame.plot` was throwing a :exc:`ValueError` when the Series or DataFrame was
- indexed by a :class:`.TimedeltaIndex` with a fixed frequency and the x-axis lower limit was greater than the upper limit (:issue:`37454`)
-- Bug in :meth:`.DataFrameGroupBy.boxplot` when ``subplots=False`` would raise a ``KeyError`` (:issue:`16748`)
-- Bug in :meth:`DataFrame.plot` and :meth:`Series.plot` was overwriting matplotlib's shared y axes behaviour when no ``sharey`` parameter was passed (:issue:`37942`)
+- Bug in :meth:`DataFrameGroupBy.boxplot` when ``subplots=False`` would raise a ``KeyError`` (:issue:`16748`)
Groupby/resample/rolling
^^^^^^^^^^^^^^^^^^^^^^^^
-- Bug in :meth:`.DataFrameGroupBy.count` and :meth:`SeriesGroupBy.sum` returning ``NaN`` for missing categories when grouped on multiple ``Categoricals``. Now returning ``0`` (:issue:`35028`)
-- Bug in :meth:`.DataFrameGroupBy.apply` that would sometimes throw an erroneous ``ValueError`` if the grouping axis had duplicate entries (:issue:`16646`)
-- Bug in :meth:`DataFrame.resample` that would throw a ``ValueError`` when resampling from ``"D"`` to ``"24H"`` over a transition into daylight savings time (DST) (:issue:`35219`)
-- Bug when combining methods :meth:`DataFrame.groupby` with :meth:`DataFrame.resample` and :meth:`DataFrame.interpolate` raising a ``TypeError`` (:issue:`35325`)
-- Bug in :meth:`.DataFrameGroupBy.apply` where a non-nuisance grouping column would be dropped from the output columns if another groupby method was called before ``.apply`` (:issue:`34656`)
+- Bug in :meth:`DataFrameGroupBy.count` and :meth:`SeriesGroupBy.sum` returning ``NaN`` for missing categories when grouped on multiple ``Categoricals``. Now returning ``0`` (:issue:`35028`)
+- Bug in :meth:`DataFrameGroupBy.apply` that would sometimes throw an erroneous ``ValueError`` if the grouping axis had duplicate entries (:issue:`16646`)
+- Bug in :meth:`DataFrame.resample` that would throw a ``ValueError`` when resampling from ``"D"`` to ``"24H"`` over a transition into daylight savings time (DST) (:issue:`35219`)
+- Bug when combining methods :meth:`DataFrame.groupby` with :meth:`DataFrame.resample` and :meth:`DataFrame.interpolate` raising a ``TypeError`` (:issue:`35325`)
+- Bug in :meth:`DataFrameGroupBy.apply` where a non-nuisance grouping column would be dropped from the output columns if another groupby method was called before ``.apply()`` (:issue:`34656`)
- Bug when subsetting columns on a :class:`~pandas.core.groupby.DataFrameGroupBy` (e.g. ``df.groupby('a')[['b']])``) would reset the attributes ``axis``, ``dropna``, ``group_keys``, ``level``, ``mutated``, ``sort``, and ``squeeze`` to their default values. (:issue:`9959`)
-- Bug in :meth:`.DataFrameGroupBy.tshift` failing to raise ``ValueError`` when a frequency cannot be inferred for the index of a group (:issue:`35937`)
+- Bug in :meth:`DataFrameGroupBy.tshift` failing to raise ``ValueError`` when a frequency cannot be inferred for the index of a group (:issue:`35937`)
- Bug in :meth:`DataFrame.groupby` does not always maintain column index name for ``any``, ``all``, ``bfill``, ``ffill``, ``shift`` (:issue:`29764`)
-- Bug in :meth:`.DataFrameGroupBy.apply` raising error with ``np.nan`` group(s) when ``dropna=False`` (:issue:`35889`)
-- Bug in :meth:`.Rolling.sum` returned wrong values when dtypes where mixed between float and integer and ``axis=1`` (:issue:`20649`, :issue:`35596`)
-- Bug in :meth:`.Rolling.count` returned ``np.nan`` with :class:`~pandas.api.indexers.FixedForwardWindowIndexer` as window, ``min_periods=0`` and only missing values in the window (:issue:`35579`)
+- Bug in :meth:`DataFrameGroupBy.apply` raising error with ``np.nan`` group(s) when ``dropna=False`` (:issue:`35889`)
+- Bug in :meth:`Rolling.sum()` returned wrong values when dtypes were mixed between float and integer and axis was equal to one (:issue:`20649`, :issue:`35596`)
+- Bug in :meth:`Rolling.count` returned ``np.nan`` with :class:`pandas.api.indexers.FixedForwardWindowIndexer` as window, ``min_periods=0`` and only missing values in the window (:issue:`35579`)
- Bug where :class:`pandas.core.window.Rolling` produces incorrect window sizes when using a ``PeriodIndex`` (:issue:`34225`)
-- Bug in :meth:`.DataFrameGroupBy.ffill` and :meth:`.DataFrameGroupBy.bfill` where a ``NaN`` group would return filled values instead of ``NaN`` when ``dropna=True`` (:issue:`34725`)
-- Bug in :meth:`.RollingGroupby.count` where a ``ValueError`` was raised when specifying the ``closed`` parameter (:issue:`35869`)
-- Bug in :meth:`.DataFrameGroupBy.rolling` returning wrong values with partial centered window (:issue:`36040`)
-- Bug in :meth:`.DataFrameGroupBy.rolling` returned wrong values with timeaware window containing ``NaN``. Raises ``ValueError`` because windows are not monotonic now (:issue:`34617`)
-- Bug in :meth:`.Rolling.__iter__` where a ``ValueError`` was not raised when ``min_periods`` was larger than ``window`` (:issue:`37156`)
-- Using :meth:`.Rolling.var` instead of :meth:`.Rolling.std` avoids numerical issues for :meth:`.Rolling.corr` when :meth:`.Rolling.var` is still within floating point precision while :meth:`.Rolling.std` is not (:issue:`31286`)
-- Bug in :meth:`.DataFrameGroupBy.quantile` and :meth:`.Resampler.quantile` raised ``TypeError`` when values were of type ``Timedelta`` (:issue:`29485`)
-- Bug in :meth:`.Rolling.median` and :meth:`.Rolling.quantile` returned wrong values for :class:`.BaseIndexer` subclasses with non-monotonic starting or ending points for windows (:issue:`37153`)
+- Bug in :meth:`DataFrameGroupBy.ffill` and :meth:`DataFrameGroupBy.bfill` where a ``NaN`` group would return filled values instead of ``NaN`` when ``dropna=True`` (:issue:`34725`)
+- Bug in :meth:`RollingGroupby.count` where a ``ValueError`` was raised when specifying the ``closed`` parameter (:issue:`35869`)
+- Bug in :meth:`DataFrameGroupBy.rolling` returning wrong values with partial centered window (:issue:`36040`)
+- Bug in :meth:`DataFrameGroupBy.rolling` returned wrong values with timeaware window containing ``NaN``. Raises ``ValueError`` because windows are not monotonic now (:issue:`34617`)
+- Bug in :meth:`Rolling.__iter__` where a ``ValueError`` was not raised when ``min_periods`` was larger than ``window`` (:issue:`37156`)
+- Using :meth:`Rolling.var()` instead of :meth:`Rolling.std()` avoids numerical issues for :meth:`Rolling.corr()` when :meth:`Rolling.var()` is still within floating point precision while :meth:`Rolling.std()` is not (:issue:`31286`)
+- Bug in :meth:`DataFrameGroupBy.quantile` and :meth:`Resampler.quantile` raised ``TypeError`` when values were of type ``Timedelta`` (:issue:`29485`)
+- Bug in :meth:`Rolling.median` and :meth:`Rolling.quantile` returned wrong values for :class:`BaseIndexer` subclasses with non-monotonic starting or ending points for windows (:issue:`37153`)
- Bug in :meth:`DataFrame.groupby` dropped ``nan`` groups from result with ``dropna=False`` when grouping over a single column (:issue:`35646`, :issue:`35542`)
-- Bug in :meth:`.DataFrameGroupBy.head`, :meth:`.DataFrameGroupBy.tail`, :meth:`SeriesGroupBy.head`, and :meth:`SeriesGroupBy.tail` would raise when used with ``axis=1`` (:issue:`9772`)
-- Bug in :meth:`.DataFrameGroupBy.transform` would raise when used with ``axis=1`` and a transformation kernel (e.g. "shift") (:issue:`36308`)
Reshaping
^^^^^^^^^
- Bug in :meth:`DataFrame.pivot_table` with ``aggfunc='count'`` or ``aggfunc='sum'`` returning ``NaN`` for missing categories when pivoted on a ``Categorical``. Now returning ``0`` (:issue:`31422`)
-- Bug in :func:`concat` and :class:`DataFrame` constructor where input index names are not preserved in some cases (:issue:`13475`)
+- Bug in :func:`union_indexes` where input index names are not preserved in some cases. Affects :func:`concat` and :class:`DataFrame` constructor (:issue:`13475`)
- Bug in func :meth:`crosstab` when using multiple columns with ``margins=True`` and ``normalize=True`` (:issue:`35144`)
-- Bug in :meth:`DataFrame.stack` where an empty DataFrame.stack would raise an error (:issue:`36113`). Now returning an empty Series with empty MultiIndex.
-- Bug in :meth:`Series.unstack`. Now a Series with single level of Index trying to unstack would raise a ValueError. (:issue:`36113`)
- Bug in :meth:`DataFrame.agg` with ``func={'name':<FUNC>}`` incorrectly raising ``TypeError`` when ``DataFrame.columns==['Name']`` (:issue:`36212`)
-- Bug in :meth:`Series.transform` would give incorrect results or raise when the argument ``func`` was a dictionary (:issue:`35811`)
-- Bug in :meth:`DataFrame.pivot` did not preserve :class:`MultiIndex` level names for columns when rows and columns are both multiindexed (:issue:`36360`)
-- Bug in :meth:`DataFrame.pivot` modified ``index`` argument when ``columns`` was passed but ``values`` was not (:issue:`37635`)
-- Bug in :meth:`DataFrame.join` returned a non deterministic level-order for the resulting :class:`MultiIndex` (:issue:`36910`)
-- Bug in :meth:`DataFrame.combine_first` caused wrong alignment with dtype ``string`` and one level of ``MultiIndex`` containing only ``NA`` (:issue:`37591`)
-- Fixed regression in :func:`merge` on merging :class:`.DatetimeIndex` with empty DataFrame (:issue:`36895`)
+- Bug in :meth:`Series.transform` would give incorrect results or raise when the argument ``func`` was a dictionary (:issue:`35811`)
+- Bug in :meth:`DataFrame.pivot` did not preserve :class:`MultiIndex` level names for columns when rows and columns are both multiindexed (:issue:`36360`)
+- Bug in :meth:`DataFrame.join` returned a non-deterministic level-order for the resulting :class:`MultiIndex` (:issue:`36910`)
+- Bug in :meth:`DataFrame.combine_first()` caused wrong alignment with dtype ``string`` and one level of ``MultiIndex`` containing only ``NA`` (:issue:`37591`)
+- Fixed regression in :func:`merge` on merging DatetimeIndex with empty DataFrame (:issue:`36895`)
- Bug in :meth:`DataFrame.apply` not setting index of return value when ``func`` return type is ``dict`` (:issue:`37544`)
-- Bug in :func:`concat` resulting in a ``ValueError`` when at least one of both inputs had a non-unique index (:issue:`36263`)
-- Bug in :meth:`DataFrame.merge` and :meth:`pandas.merge` returning inconsistent ordering in result for ``how=right`` and ``how=left`` (:issue:`35382`)
Sparse
^^^^^^
@@ -799,26 +611,23 @@ Sparse
ExtensionArray
^^^^^^^^^^^^^^
-- Fixed bug where :class:`DataFrame` column set to scalar extension type via a dict instantiation was considered an object type rather than the extension type (:issue:`35965`)
-- Fixed bug where ``astype()`` with equal dtype and ``copy=False`` would return a new object (:issue:`28488`)
-- Fixed bug when applying a NumPy ufunc with multiple outputs to an :class:`.IntegerArray` returning None (:issue:`36913`)
-- Fixed an inconsistency in :class:`.PeriodArray`'s ``__init__`` signature to those of :class:`.DatetimeArray` and :class:`.TimedeltaArray` (:issue:`37289`)
-- Reductions for :class:`.BooleanArray`, :class:`.Categorical`, :class:`.DatetimeArray`, :class:`.FloatingArray`, :class:`.IntegerArray`, :class:`.PeriodArray`, :class:`.TimedeltaArray`, and :class:`.PandasArray` are now keyword-only methods (:issue:`37541`)
+- Fixed bug where :class:`DataFrame` column set to scalar extension type via a dict instantiation was considered an object type rather than the extension type (:issue:`35965`)
+- Fixed bug where ``astype()`` with equal dtype and ``copy=False`` would return a new object (:issue:`28488`)
+- Fixed bug when applying a NumPy ufunc with multiple outputs to a :class:`pandas.arrays.IntegerArray` returning None (:issue:`36913`)
+- Fixed an inconsistency in :class:`PeriodArray`'s ``__init__`` signature to those of :class:`DatetimeArray` and :class:`TimedeltaArray` (:issue:`37289`)
+- Reductions for :class:`BooleanArray`, :class:`Categorical`, :class:`DatetimeArray`, :class:`FloatingArray`, :class:`IntegerArray`, :class:`PeriodArray`, :class:`TimedeltaArray`, and :class:`PandasArray` are now keyword-only methods (:issue:`37541`)
Other
^^^^^
-- Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` incorrectly raising an ``AssertionError`` instead of a ``ValueError`` when invalid parameter combinations are passed (:issue:`36045`)
+- Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` incorrectly raising ``AssertionError`` instead of ``ValueError`` when invalid parameter combinations are passed (:issue:`36045`)
- Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` with numeric values and string ``to_replace`` (:issue:`34789`)
-- Fixed metadata propagation in :meth:`Series.abs` and ufuncs called on Series and DataFrames (:issue:`28283`)
-- Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` incorrectly casting from ``PeriodDtype`` to object dtype (:issue:`34871`)
- Fixed bug in metadata propagation incorrectly copying DataFrame columns as metadata when the column name overlaps with the metadata name (:issue:`37037`)
-- Fixed metadata propagation in the :class:`Series.dt`, :class:`Series.str` accessors, :class:`DataFrame.duplicated`, :class:`DataFrame.stack`, :class:`DataFrame.unstack`, :class:`DataFrame.pivot`, :class:`DataFrame.append`, :class:`DataFrame.diff`, :class:`DataFrame.applymap` and :class:`DataFrame.update` methods (:issue:`28283`, :issue:`37381`)
-- Fixed metadata propagation when selecting columns with ``DataFrame.__getitem__`` (:issue:`28283`)
-- Bug in :meth:`Index.union` behaving differently depending on whether operand is an :class:`Index` or other list-like (:issue:`36384`)
-- Passing an array with 2 or more dimensions to the :class:`Series` constructor now raises the more specific ``ValueError`` rather than a bare ``Exception`` (:issue:`35744`)
-- Bug in ``dir`` where ``dir(obj)`` wouldn't show attributes defined on the instance for pandas objects (:issue:`37173`)
-- Bug in :meth:`RangeIndex.difference` returning :class:`Int64Index` in some cases where it should return :class:`RangeIndex` (:issue:`38028`)
+- Fixed metadata propagation in the :class:`Series.dt`, :class:`Series.str` accessors, :class:`DataFrame.duplicated`, :class:`DataFrame.stack`, :class:`DataFrame.unstack`, :class:`DataFrame.pivot`, :class:`DataFrame.append`, :class:`DataFrame.diff`, :class:`DataFrame.applymap` and :class:`DataFrame.update` methods (:issue:`28283`, :issue:`37381`)
+- Bug in :meth:`Index.union` behaving differently depending on whether operand is an :class:`Index` or other list-like (:issue:`36384`)
+- Passing an array with 2 or more dimensions to the :class:`Series` constructor now raises the more specific ``ValueError`` rather than a bare ``Exception`` (:issue:`35744`)
+- Bug in ``accessor.DirNamesMixin``, where ``dir(obj)`` wouldn't show attributes defined on the instance (:issue:`37173`).
+- Bug in :meth:`Series.nunique` with ``dropna=True`` was returning incorrect results when both ``NA`` and ``None`` missing values were present (:issue:`37566`); a short sketch of the fixed behaviour follows this hunk
.. ---------------------------------------------------------------------------
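A minimal sketch of the ``Series.nunique`` fix noted above (:issue:`37566`), assuming an object-dtype Series that carries both ``None`` and ``np.nan`` as missing values:

    import numpy as np
    import pandas as pd

    s = pd.Series(["a", None, np.nan])
    # With dropna=True both missing-value markers are excluded before
    # counting, so only the single non-missing value remains.
    assert s.nunique(dropna=True) == 1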
diff --git a/environment.yml b/environment.yml
index b99b856187fb6..77a9c5fd4822d 100644
--- a/environment.yml
+++ b/environment.yml
@@ -12,9 +12,6 @@ dependencies:
- asv
# building
- # The compiler packages are meta-packages and install the correct compiler (activation) packages on the respective platforms.
- - c-compiler
- - cxx-compiler
- cython>=0.29.21
# code checks
diff --git a/pandas/__init__.py b/pandas/__init__.py
index cc5d835a52833..cf7ae2505b72d 100644
--- a/pandas/__init__.py
+++ b/pandas/__init__.py
@@ -33,7 +33,7 @@
raise ImportError(
f"C extension: {module} not built. If you want to import "
"pandas from the source directory, you may need to run "
- "'python setup.py build_ext --force' to build the C extensions first."
+ "'python setup.py build_ext --inplace --force' to build the C extensions first."
) from e
from pandas._config import (
@@ -189,10 +189,25 @@
# GH 27101
+# TODO: remove Panel compat in 1.0
def __getattr__(name):
import warnings
- if name == "datetime":
+ if name == "Panel":
+
+ warnings.warn(
+ "The Panel class is removed from pandas. Accessing it "
+ "from the top-level namespace will also be removed in the next version",
+ FutureWarning,
+ stacklevel=2,
+ )
+
+ class Panel:
+ pass
+
+ return Panel
+
+ elif name == "datetime":
warnings.warn(
"The pandas.datetime class is deprecated "
"and will be removed from pandas in a future version. "
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 24156c88f0d76..5a958d5e0bd3c 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -903,12 +903,13 @@ def group_last(rank_t[:, :] out,
ndarray[int64_t, ndim=2] nobs
bint runtime_error = False
+ assert min_count == -1, "'min_count' only used in add and prod"
+
# TODO(cython 3.0):
# Instead of `labels.shape[0]` use `len(labels)`
if not len(values) == labels.shape[0]:
raise AssertionError("len(index) != len(labels)")
- min_count = max(min_count, 1)
nobs = np.zeros((