Skip to content

BUG fixing module first() and DateOffset #52487

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 17 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ Bug fixes
- Bug in :meth:`Series.dt.tz_localize` incorrectly localizing timestamps with :class:`ArrowDtype` (:issue:`52677`)
- Bug in logical and comparison operations between :class:`ArrowDtype` and numpy masked types (e.g. ``"boolean"``) (:issue:`52625`)
- Fixed bug in :func:`merge` when merging with ``ArrowDtype`` one one and a NumPy dtype on the other side (:issue:`52406`)
- Fixed bug in :meth:`DataFrame.first` when used with a :class:`DateOffset` (:issue:`45908`)
- Fixed segfault in :meth:`Series.to_numpy` with ``null[pyarrow]`` dtype (:issue:`52443`)

.. ---------------------------------------------------------------------------
Expand Down
16 changes: 13 additions & 3 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
Period,
Tick,
Timestamp,
offsets,
to_offset,
)
from pandas._typing import (
Expand Down Expand Up @@ -9091,16 +9092,25 @@ def first(self, offset) -> Self:
if len(self.index) == 0:
return self.copy(deep=False)

offset = to_offset(offset)
if not isinstance(offset, Tick) and offset.is_on_offset(self.index[0]):
if isinstance(offset, offsets.DateOffset):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

doesn't the offset = to_offset(offset) below make this check unnecessary?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the issue is that doing this check after to_offset would make it impossible to distinguish between MonthEnd, as that would also satisfy isinstance(offset, DateOffset)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

they need to be handled differently because something like DateOffset(days=15) would always return True for if not isinstance(offset, Tick) and offset.is_on_offset(self.index[0]), and so would be shifted backward by offset.base (this is why in the linked issues, the result only has a single row for DateOffset input)

input_is_offset = True
else:
input_is_offset = False
offset = to_offset(offset)

if (
not isinstance(offset, Tick)
and offset.is_on_offset(self.index[0])
and not input_is_offset
):
# GH#29623 if first value is end of period, remove offset with n = 1
# before adding the real offset
end_date = end = self.index[0] - offset.base + offset
else:
end_date = end = self.index[0] + offset

# Tick-like, e.g. 3 weeks
if isinstance(offset, Tick) and end_date in self.index:
if isinstance(offset, Tick) or input_is_offset and end_date in self.index:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can never remember if a or b and c is evaluated as (a or b) and c or a or (b and c), could we add parantheses please?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll change that.

end = self.index.searchsorted(end_date, side="left")
return self.iloc[:end]

Expand Down
48 changes: 48 additions & 0 deletions pandas/tests/frame/methods/test_first_and_last.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
Note: includes tests for `last`
"""
import numpy as np
import pytest

import pandas as pd
Expand Down Expand Up @@ -95,3 +96,50 @@ def test_empty_not_input(self, func):
result = getattr(df, func)(offset=1)
tm.assert_frame_equal(df, result)
assert df is not result

@pytest.mark.parametrize("start, periods", [("2010-03-31", 1), ("2010-03-30", 2)])
def test_last_day_of_months_with_date_offset(self, frame_or_series, start, periods):
x = frame_or_series([1] * 100, index=pd.date_range(start, periods=100))
result = x.first(pd.DateOffset(days=periods))
expected = frame_or_series(
[1] * periods, index=pd.date_range(start, periods=periods)
)
tm.assert_equal(result, expected)

def test_date_offset_multiple_days(self, frame_or_series):
x = frame_or_series([1] * 100, index=pd.date_range("2010-03-31", periods=100))
result = x.first(pd.DateOffset(days=2))
expected = frame_or_series(
[1] * 2, index=pd.date_range("2010-03-31", "2010-04-01")
)
tm.assert_equal(result, expected)

def test_first_with_date_offset(self):
# GH#51284
i = pd.to_datetime(["2018-04-09", "2018-04-10", "2018-04-11", "2018-04-12"])
x = DataFrame({"A": [1, 2, 3, 4]}, index=i)
result = x.first(pd.DateOffset(days=2))
expected = DataFrame(
{"A": [1, 2]}, index=pd.to_datetime(["2018-04-09", "2018-04-10"])
)
tm.assert_equal(result, expected)

def test_date_offset_15_days(self):
# GH#45908
i = pd.date_range("2018-04-09", periods=30, freq="2D")
x = DataFrame({"A": np.arange(30)}, index=i)
result = x.first(pd.DateOffset(days=15))
i2 = pd.date_range("2018-04-09", periods=8, freq="2D")
expected = DataFrame({"A": np.arange(8)}, index=i2)
tm.assert_equal(result, expected)

def test_first_with_date_offset_months(self, frame_or_series):
periods = 40
x = frame_or_series(
[1] * periods, index=pd.date_range("2010-03-31", periods=periods)
)
result = x.first(pd.DateOffset(months=1))
expected = frame_or_series(
[1] * 30, index=pd.date_range("2010-03-31", periods=30)
)
tm.assert_equal(result, expected)