Skip to content

BUG fixing module first() and DateOffset #52487

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 17 commits into from
Closed
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 15 additions & 2 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,9 @@
from pandas._libs.tslibs import (
Period,
Tick,
Timedelta,
Timestamp,
offsets,
to_offset,
)
from pandas._typing import (
Expand Down Expand Up @@ -9076,9 +9078,20 @@ def first(self, offset) -> Self:

if len(self.index) == 0:
return self.copy(deep=False)
if type(offset) is not offsets.DateOffset:
offset = to_offset(offset)

offset = to_offset(offset)
if not isinstance(offset, Tick) and offset.is_on_offset(self.index[0]):
if type(offset) is offsets.DateOffset and offset.is_on_offset(self.index[0]):
end_date = self.index[0] - Timedelta(1, unit="d") + offset
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we can hard-code Timedelta(1, unit='d'), e.g. it would break DateOffset(minutes=2):

(Pdb) p self.index[0]
Timestamp('2018-04-09 00:00:00')
(Pdb) p self.index[0] - Timedelta(1, unit='d')
Timestamp('2018-04-08 00:00:00')
(Pdb) p self.index[0] - Timedelta(1, unit='d') + offset
Timestamp('2018-04-08 00:02:00')

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you for the review!
Yes, we should not hard-code like that, I did it as a draft. I wanted to show how different the behavior of DateOffset is from the "Tick" object.
The whole first() was not implemented for DateOffset, there were no tests either.
Then there are bugs on it (on DateOffset itself).
In the end, I wanted to know if patching first() was correct, if DateOffset will be deprecated or if it should be fixed.

Some of the issues of DateOffset:

  1. DateOffset.is_on_offset is different than the offset done with a string.
  2. DateOffset.name doesn't work
  3. At some point adding an offset to it, was going backwards! (as in self.index[0] + offset = earlier date)


return self.loc[:end_date]

# if not isinstance(offset, Tick) and offset.is_on_offset(self.index[0]):
if (
not isinstance(offset, Tick)
and offset.is_on_offset(self.index[0])
and type(offset) is not offsets.DateOffset
):
# GH#29623 if first value is end of period, remove offset with n = 1
# before adding the real offset
end_date = end = self.index[0] - offset.base + offset
Expand Down
40 changes: 40 additions & 0 deletions pandas/tests/frame/methods/test_first_and_last.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
"""
Note: includes tests for `last`
"""
import numpy as np
import pytest

import pandas as pd
from pandas import (
DataFrame,
Series,
bdate_range,
)

# from pandas._libs.tslibs import offsets
import pandas._testing as tm


Expand Down Expand Up @@ -95,3 +99,39 @@ def test_empty_not_input(self, func):
result = getattr(df, func)(offset=1)
tm.assert_frame_equal(df, result)
assert df is not result

@pytest.mark.parametrize("start, periods", [("2010-03-31", 1), ("2010-03-30", 2)])
def test_first_with_first_day_last_of_m_DO(self, frame_or_series, start, periods):
x = frame_or_series([1] * 100, index=bdate_range(start, periods=100))
result = x.first(pd.DateOffset(days=periods))
expected = frame_or_series(
[1] * periods, index=bdate_range(start, periods=periods)
)
tm.assert_equal(result, expected)

def test_first_with_first_day_end_of_frq_n_greater_one_DO(self, frame_or_series):
x = frame_or_series([1] * 100, index=bdate_range("2010-03-31", periods=100))
result = x.first(pd.DateOffset(days=2))
expected = frame_or_series(
[1] * 2, index=bdate_range("2010-03-31", "2010-04-01")
)
tm.assert_equal(result, expected)

def test_first_w_DateOffset(self):
# GH#51284
i = pd.date_range("2018-04-09", periods=4, freq="2D")
x = DataFrame({"A": [1, 2, 3, 4]}, index=i)
result = x.first(pd.DateOffset(days=3))
expected = DataFrame(
{"A": [1, 2]}, index=pd.date_range("2018-04-09", periods=2, freq="2D")
)
tm.assert_equal(result, expected)

def test_first_w_DateOffset_other(self):
# GH#45908
i = pd.date_range("2018-04-09", periods=30, freq="2D")
x = DataFrame({"A": Series(np.arange(30), index=i)}, index=i)
result = x.first(pd.DateOffset(days=15))
i2 = pd.date_range("2018-04-09", periods=8, freq="2D")
expected = DataFrame({"A": Series(np.arange(8), index=i2)}, index=i2)
tm.assert_equal(result, expected)