Skip to content

BUG: Min/max does not work for dates with timezones if there are missing values in the data frame #44222

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 14 commits into from
Closed
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -538,7 +538,7 @@ Missing
^^^^^^^
- Bug in :meth:`DataFrame.fillna` with limit and no method ignores axis='columns' or ``axis = 1`` (:issue:`40989`)
- Bug in :meth:`DataFrame.fillna` not replacing missing values when using a dict-like ``value`` and duplicate column names (:issue:`43476`)
-
- Bug in :meth:`DataFrame.max`, :meth:`DataFrame.min`, :meth:`Series.max` and :meth:`Series.min` when called on datetime columns with timezone aware data and missing elements (:issue:`27794` and :issue:`44196`)

MultiIndex
^^^^^^^^^^
Expand Down
1 change: 1 addition & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -10033,6 +10033,7 @@ def _get_data() -> DataFrame:
data = _get_data()
labels = data._get_agg_axis(axis)

# do we need this line?
values = data.values
with np.errstate(all="ignore"):
result = func(values)
Expand Down
15 changes: 14 additions & 1 deletion pandas/core/nanops.py
Original file line number Diff line number Diff line change
Expand Up @@ -1039,7 +1039,20 @@ def reduction(
except (AttributeError, TypeError, ValueError):
result = np.nan
else:
result = getattr(values, meth)(axis)
try:
result = getattr(values, meth)(axis)
except TypeError as e:
# the only case when this can happen is for timezone-aware
# Timestamps where a NaT value is cast to np.inf
from pandas.core.dtypes.common import is_float
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

just import this at the top


vfunc = np.vectorize(lambda x: is_float(x))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i think we'd be better off fixing _get_values so as to not incorrectly insert a float

mask = vfunc(values)
if mask.any():
values = np.where(mask, NaT, values)
result = getattr(values, meth)(axis)
else:
raise e

result = _maybe_null_out(result, axis, mask, values.shape)
return result
Expand Down
60 changes: 60 additions & 0 deletions pandas/tests/groupby/aggregate/test_other.py
Original file line number Diff line number Diff line change
Expand Up @@ -674,3 +674,63 @@ def weird_func(x):

result = df["decimals"].groupby(df["id1"]).agg(weird_func)
tm.assert_series_equal(result, expected, check_names=False)


def test_timezone_min_max_with_nat():
    """Row-wise ``min``/``max`` over tz-aware columns must skip missing values.

    The frame is built by concatenating a three-day and a two-day UTC series
    column-wise, so the last row of the second column is missing.  Both
    reductions are called with their default ``skipna=True``, so the missing
    entry is ignored and the last row reduces to the only value present.
    """
    # GH#27794
    three_days = pd.date_range(start="2018-01-01", end="2018-01-03")
    two_days = pd.date_range(start="2018-01-01", end="2018-01-02")
    df = pd.concat(
        [
            three_days.to_series().dt.tz_localize("UTC"),
            two_days.to_series().dt.tz_localize("UTC"),
        ],
        axis=1,
    )
    # Guard the fixture itself: the shorter column must be padded with a
    # missing value, otherwise the test would not exercise the NaT path.
    assert pd.isna(df.iloc[2, 1])

    # Wherever both columns are populated they hold the same timestamp, so
    # min and max share one expected result.  The row index of the reduction
    # is the frame's own index.
    expected = pd.Series(
        pd.date_range(start="2018-01-01", end="2018-01-03", tz="UTC"),
        index=df.index,
    )

    result = df.min(axis=1)
    tm.assert_series_equal(result, expected)
    # skipna=True: the row containing NaT yields the remaining value.
    assert result.iloc[2] == pd.Timestamp("2018-01-03", tz="UTC")

    result = df.max(axis=1)
    tm.assert_series_equal(result, expected)
    assert result.iloc[2] == pd.Timestamp("2018-01-03", tz="UTC")


def test_min_max_timezone_nat2():
    """Row-wise ``max`` over tz-aware columns must skip a single ``NaT``.

    Column ``B`` is column ``A`` shifted by 20 minutes, so ``B`` is the
    row-wise maximum everywhere except the row where ``B`` is set to ``NaT``.
    With the default ``skipna=True`` that row reduces to the value in ``A``.
    """
    # NOTE(review): a reviewer on the pull request asked for these tests to
    # go in tests.frame.test_reductions.
    # GH#44196
    rng_with_tz = pd.date_range(
        start="2021-10-01T12:00:00+02:00",
        end="2021-10-02T12:00:00+02:00",
        # Four-hour step spelled as a Timedelta rather than the "4H" alias,
        # whose uppercase form is deprecated in newer pandas releases.
        freq=pd.Timedelta(hours=4),
    )
    df_with_tz = pd.DataFrame(
        data={"A": rng_with_tz, "B": rng_with_tz + pd.Timedelta(minutes=20)}
    )
    df_with_tz.iloc[2, 1] = pd.NaT

    result = df_with_tz.max(axis=1)

    # Start from column B and patch the row whose B value is missing with
    # the value from column A.
    expected = df_with_tz["B"].copy()
    expected.name = None
    expected.iloc[2] = df_with_tz["A"].iloc[2]
    tm.assert_series_equal(result, expected)

    # Pin the interesting rows explicitly as well.
    assert result.iloc[0] == pd.Timestamp("2021-10-01T12:20:00+02:00")
    assert result.iloc[2] == pd.Timestamp("2021-10-01T20:00:00+02:00")
    assert result.iloc[6] == pd.Timestamp("2021-10-02T12:20:00+02:00")
2 changes: 1 addition & 1 deletion pandas/tests/indexes/datetimes/test_date_range.py
Original file line number Diff line number Diff line change
Expand Up @@ -746,7 +746,7 @@ def test_cached_range_bug(self):
assert len(rng) == 50
assert rng[0] == datetime(2010, 9, 1, 5)

def test_timezone_comparaison_bug(self):
def test_timezone_comparison_bug(self):
# smoke test
start = Timestamp("20130220 10:00", tz="US/Eastern")
result = date_range(start, periods=2, tz="US/Eastern")
Expand Down