Skip to content

BUG: Min/max does not work for dates with timezones if there are missing values in the data frame #44222

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 14 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -549,7 +549,7 @@ Missing
^^^^^^^
- Bug in :meth:`DataFrame.fillna` with limit and no method ignores axis='columns' or ``axis = 1`` (:issue:`40989`)
- Bug in :meth:`DataFrame.fillna` not replacing missing values when using a dict-like ``value`` and duplicate column names (:issue:`43476`)
-
- Bug in :meth:`DataFrame.max`, :meth:`DataFrame.min`, :meth:`Series.max` and :meth:`Series.min` when called on timezone-aware datetime columns containing missing values (:issue:`27794`, :issue:`44196`)

MultiIndex
^^^^^^^^^^
Expand Down
6 changes: 5 additions & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -9979,7 +9979,11 @@ def _get_data() -> DataFrame:
data = self._get_bool_data()
return data

if numeric_only is not None or axis == 0:
if (
numeric_only is not None
or axis == 0
or (name in ["max", "min"] and axis == 1)
):
# For numeric_only non-None and axis non-None, we know
# which blocks to use and no try/except is needed.
# For numeric_only=None only the case with axis==0 and no object
Expand Down
142 changes: 141 additions & 1 deletion pandas/tests/frame/test_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@
DataFrame,
Index,
MultiIndex,
PeriodDtype,
Series,
Timedelta,
Timestamp,
date_range,
isna,
Expand Down Expand Up @@ -756,7 +758,7 @@ def test_operators_timedelta64(self):
# excludes numeric
with tm.assert_produces_warning(FutureWarning, match="Select only valid"):
result = mixed.min(axis=1)
expected = Series([1, 1, 1.0], index=[0, 1, 2])
expected = Series([])
tm.assert_series_equal(result, expected)

# works when only those columns are selected
Expand Down Expand Up @@ -1763,3 +1765,141 @@ def test_prod_sum_min_count_mixed_object():
msg = re.escape("unsupported operand type(s) for +: 'int' and 'str'")
with pytest.raises(TypeError, match=msg):
df.sum(axis=0, min_count=1, numeric_only=False)


def test_timezone_min_max_with_nat():
    """Check row-wise min and max on tz-aware columns where one holds NaT."""
    # GH#27794
    complete = pd.date_range(start="2018-01-01", end="2018-01-03", tz="UTC")
    with_gap = pd.date_range(
        start="2018-01-01", end="2018-01-02", tz="UTC"
    ).insert(2, pd.NaT)
    df = pd.DataFrame({"A": complete, "B": with_gap})

    # Column "B" equals column "A" wherever it is present, so the expected
    # row-wise minimum and maximum are the same three days.
    days = ("2018-01-01", "2018-01-02", "2018-01-03")
    for reduce_rows in (df.min, df.max):
        expected = pd.Series([pd.Timestamp(day, tz="UTC") for day in days])
        result = reduce_rows(axis=1)
        tm.assert_series_equal(result, expected)


def test_min_max_timestamp_timezone_nat():
    """Check row-wise max and min on tz-aware columns with one NaT entry.

    Column "B" is column "A" shifted by 20 minutes, with the entry in the
    third row replaced by ``NaT``. Both reductions are asserted so that the
    test covers what its name promises.
    """
    # GH#44196
    # Lowercase "4h" is used because newer pandas deprecates the uppercase
    # "H" alias; older releases accept the lowercase spelling as well.
    rng_with_tz = pd.date_range(
        start="2021-10-01T12:00:00+02:00", end="2021-10-02T12:00:00+02:00", freq="4h"
    )
    df_with_tz = DataFrame(
        data={"A": rng_with_tz, "B": rng_with_tz + pd.Timedelta(minutes=20)}
    )
    df_with_tz.iloc[2, 1] = pd.NaT

    # "B" wins every row except the third, where only "A" has a value.
    result = df_with_tz.max(axis=1)
    expected = pd.Series(
        [
            pd.Timestamp("2021-10-01T12:20:00+02:00"),
            pd.Timestamp("2021-10-01T16:20:00+02:00"),
            pd.Timestamp("2021-10-01T20:00:00+02:00"),
            pd.Timestamp("2021-10-02T00:20:00+02:00"),
            pd.Timestamp("2021-10-02T04:20:00+02:00"),
            pd.Timestamp("2021-10-02T08:20:00+02:00"),
            pd.Timestamp("2021-10-02T12:20:00+02:00"),
        ]
    )
    tm.assert_series_equal(result, expected)

    # "A" is never later than "B", so the row-wise minimum is column "A"
    # in every row, including the one where "B" is NaT.
    result = df_with_tz.min(axis=1)
    expected = pd.Series(rng_with_tz)
    tm.assert_series_equal(result, expected)


def test_timezone_min_max_both_axis():
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same bugs probably also affect timedelta64 and PeriodDtype? can you test those too?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added tests for these cases, but I am not sure I understood you correctly. This bug only appears for timezone-aware data; as far as I know, timedelta and PeriodDtype carry no timezone information, so I don't expect the bug to exist there. Do you mean adding these types to a datetime column?

rng_with_tz = pd.date_range(
start="2021-10-01T12:00:00+02:00", end="2021-10-02T12:00:00+02:00", freq="4H"
)
df_with_tz = DataFrame(
data={"A": rng_with_tz, "B": rng_with_tz + pd.Timedelta(minutes=20)}
)
df_with_tz.iloc[2, 1] = pd.NaT

result = df_with_tz.max(axis=1)
expected = df_with_tz.T.max(axis=0)

tm.assert_series_equal(result, expected)

result = df_with_tz.min(axis=1)
expected = df_with_tz.T.min(axis=0)

tm.assert_series_equal(result, expected)


def test_min_max_timedelta64_nat():
    """Check min and max along both axes of a Timedelta frame holding a NaT.

    Each reduction is compared with hand-written expected values and with the
    same reduction applied to the transposed frame along the other axis.
    """
    first_row = [Timedelta(minutes=20), Timedelta(days=2), Timedelta(seconds=3)]
    second_row = [
        Timedelta(minutes=2, seconds=2),
        Timedelta(days=2, minutes=30),
        pd.NaT,
    ]
    df = DataFrame([first_row, second_row])

    # (reduction name, axis, expected values), in the order they are checked.
    cases = [
        (
            "min",
            0,
            [Timedelta(minutes=2, seconds=2), Timedelta(days=2), Timedelta(seconds=3)],
        ),
        ("min", 1, [Timedelta(seconds=3), Timedelta(minutes=2, seconds=2)]),
        (
            "max",
            0,
            [Timedelta(minutes=20), Timedelta(days=2, minutes=30), Timedelta(seconds=3)],
        ),
        ("max", 1, [Timedelta(days=2), Timedelta(days=2, minutes=30)]),
    ]
    for how, axis, values in cases:
        expected = pd.Series(values)
        result = getattr(df, how)(axis=axis)
        tm.assert_series_equal(result, expected)
        # Reducing the transpose along the opposite axis must agree.
        tm.assert_series_equal(
            getattr(df, how)(axis=axis), getattr(df.T, how)(axis=1 - axis)
        )


def test_min_max_perioddtype_nat():
df = DataFrame(
[
[PeriodDtype(freq="20m"), PeriodDtype(freq="1h"), PeriodDtype(freq="1d")],
[PeriodDtype(freq="25m"), PeriodDtype(freq="2h"), pd.NaT],
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Try period_range, or even just take the frame from the dt64tz case and do a .to_period("D") on it

]
)

expected = Series([])
result = df.min(axis=0)
tm.assert_series_equal(result, expected)
tm.assert_series_equal(df.min(axis=0), df.T.min(axis=1))

expected = Series([])
result = df.min(axis=1)
tm.assert_series_equal(result, expected)
tm.assert_series_equal(df.min(axis=1), df.T.min(axis=0))

expected = Series([])
result = df.max(axis=0)
tm.assert_series_equal(result, expected)
tm.assert_series_equal(df.max(axis=0), df.T.max(axis=1))

expected = Series([])
result = df.max(axis=1)
tm.assert_series_equal(result, expected)
tm.assert_series_equal(df.max(axis=1), df.T.max(axis=0))
2 changes: 1 addition & 1 deletion pandas/tests/indexes/datetimes/test_date_range.py
Original file line number Diff line number Diff line change
Expand Up @@ -746,7 +746,7 @@ def test_cached_range_bug(self):
assert len(rng) == 50
assert rng[0] == datetime(2010, 9, 1, 5)

def test_timezone_comparaison_bug(self):
def test_timezone_comparison_bug(self):
# smoke test
start = Timestamp("20130220 10:00", tz="US/Eastern")
result = date_range(start, periods=2, tz="US/Eastern")
Expand Down