From e8ac38aa661216cd5ed89dbe8afd0cb3c58f4535 Mon Sep 17 00:00:00 2001 From: JonWiggins Date: Mon, 1 Nov 2021 23:33:12 -0500 Subject: [PATCH 1/7] BUG: Add fix for hashing timestamps with folds --- pandas/_libs/tslibs/timestamps.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 304ac9405c5e1..03ee62e59aa3d 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -180,6 +180,8 @@ cdef class _Timestamp(ABCTimestamp): def __hash__(_Timestamp self): if self.nanosecond: return hash(self.value) + if self.fold: + return datetime.__hash__(self.replace(fold=0)) return datetime.__hash__(self) def __richcmp__(_Timestamp self, object other, int op): From 0fc3b1b7d636c547360955488bd8912ccccf82be Mon Sep 17 00:00:00 2001 From: JonWiggins Date: Wed, 29 Dec 2021 14:13:37 -0600 Subject: [PATCH 2/7] Add unit test for fold hashing fix --- pandas/tests/tslibs/test_timezones.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/pandas/tests/tslibs/test_timezones.py b/pandas/tests/tslibs/test_timezones.py index 7ab0ad0856af0..16ffd7977a66e 100644 --- a/pandas/tests/tslibs/test_timezones.py +++ b/pandas/tests/tslibs/test_timezones.py @@ -166,3 +166,24 @@ def test_maybe_get_tz_offset_only(): tz = timezones.maybe_get_tz("UTC-02:45") assert tz == timezone(-timedelta(hours=2, minutes=45)) + + +def test_hash_timestamp_with_fold(): + # see gh-33931 + america_chicago = dateutil.tz.gettz("America/Chicago") + transition_1 = Timestamp( + year=2013, month=11, day=3, hour=1, minute=0, tzinfo=america_chicago + ) + transition_2 = Timestamp( + year=2013, month=11, day=3, hour=1, minute=0, fold=1, tzinfo=america_chicago + ) + assert hash(transition_1) == hash(transition_2) + + america_santiago = dateutil.tz.gettz("America/Santiago") + transition_3 = Timestamp( + year=2021, month=4, day=3, hour=23, minute=0, tz=america_santiago + ) + transition_4 = Timestamp( + year=2021, month=4, day=3, hour=23, minute=0, fold=1, tz=america_santiago + ) + assert hash(transition_3) == hash(transition_4) From 1547c0c57b6f1cb1bc46d6e824b4d772071dd227 Mon Sep 17 00:00:00 2001 From: JonWiggins Date: Wed, 29 Dec 2021 22:21:46 -0600 Subject: [PATCH 3/7] Move test from timezone to timestamp tests Parametrize the unit test --- .../tests/scalar/timestamp/test_timestamp.py | 27 +++++++++++++++++++ pandas/tests/tslibs/test_timezones.py | 21 --------------- 2 files changed, 27 insertions(+), 21 deletions(-) diff --git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py index 008731b13172e..4639a24d019fe 100644 --- a/pandas/tests/scalar/timestamp/test_timestamp.py +++ b/pandas/tests/scalar/timestamp/test_timestamp.py @@ -452,6 +452,33 @@ def test_hash_equivalent(self): stamp = Timestamp(datetime(2011, 1, 1)) assert d[stamp] == 5 + @pytest.mark.parametrize( + "timezone, year, month, day, hour", + [["America/Chicago", 2013, 11, 3, 1], ["America/Santiago", 2021, 4, 3, 23]], + ) + def test_hash_timestamp_with_fold(self, timezone, year, month, day, hour): + # see gh-33931 + test_timezone = gettz(timezone) + transition_1 = Timestamp( + year=year, + month=month, + day=day, + hour=hour, + minute=0, + fold=0, + tzinfo=test_timezone, + ) + transition_2 = Timestamp( + year=year, + month=month, + day=day, + hour=hour, + minute=0, + fold=1, + tzinfo=test_timezone, + ) + assert hash(transition_1) == hash(transition_2) + def test_tz_conversion_freq(self, tz_naive_fixture): # GH25241 with tm.assert_produces_warning(FutureWarning, match="freq"): diff --git a/pandas/tests/tslibs/test_timezones.py b/pandas/tests/tslibs/test_timezones.py index 16ffd7977a66e..7ab0ad0856af0 100644 --- a/pandas/tests/tslibs/test_timezones.py +++ b/pandas/tests/tslibs/test_timezones.py @@ -166,24 +166,3 @@ def test_maybe_get_tz_offset_only(): tz = timezones.maybe_get_tz("UTC-02:45") assert tz == timezone(-timedelta(hours=2, minutes=45)) - - -def test_hash_timestamp_with_fold(): - # see gh-33931 - america_chicago = dateutil.tz.gettz("America/Chicago") - transition_1 = Timestamp( - year=2013, month=11, day=3, hour=1, minute=0, tzinfo=america_chicago - ) - transition_2 = Timestamp( - year=2013, month=11, day=3, hour=1, minute=0, fold=1, tzinfo=america_chicago - ) - assert hash(transition_1) == hash(transition_2) - - america_santiago = dateutil.tz.gettz("America/Santiago") - transition_3 = Timestamp( - year=2021, month=4, day=3, hour=23, minute=0, tz=america_santiago - ) - transition_4 = Timestamp( - year=2021, month=4, day=3, hour=23, minute=0, fold=1, tz=america_santiago - ) - assert hash(transition_3) == hash(transition_4) From c6400d66bff1f38ece09310d8ca5bb0ff18ab162 Mon Sep 17 00:00:00 2001 From: JonWiggins Date: Thu, 30 Dec 2021 12:49:46 -0600 Subject: [PATCH 4/7] Add whatsnew entry --- doc/source/whatsnew/v1.4.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 3924191bebcfd..157b895823a16 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -712,6 +712,7 @@ Datetimelike - Bug in :class:`DateOffset`` addition with :class:`Timestamp` where ``offset.nanoseconds`` would not be included in the result (:issue:`43968`, :issue:`36589`) - Bug in :meth:`Timestamp.fromtimestamp` not supporting the ``tz`` argument (:issue:`45083`) - Bug in :class:`DataFrame` construction from dict of :class:`Series` with mismatched index dtypes sometimes raising depending on the ordering of the passed dict (:issue:`44091`) +- Bug in :class:`Timestamp` hashing during some DST transitions caused a segmentation fault (:issue:`33931`) - Timedelta From cc77ce240ead4e6eff45d0bf74525acef725947e Mon Sep 17 00:00:00 2001 From: JonWiggins Date: Fri, 31 Dec 2021 10:46:43 -0600 Subject: [PATCH 5/7] Add reindex issue to whatsnew Add unit test for reindexing with fold --- doc/source/whatsnew/v1.4.0.rst | 2 +- .../tests/scalar/timestamp/test_timestamp.py | 34 +++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 157b895823a16..b93b210078a75 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -712,7 +712,7 @@ Datetimelike - Bug in :class:`DateOffset`` addition with :class:`Timestamp` where ``offset.nanoseconds`` would not be included in the result (:issue:`43968`, :issue:`36589`) - Bug in :meth:`Timestamp.fromtimestamp` not supporting the ``tz`` argument (:issue:`45083`) - Bug in :class:`DataFrame` construction from dict of :class:`Series` with mismatched index dtypes sometimes raising depending on the ordering of the passed dict (:issue:`44091`) -- Bug in :class:`Timestamp` hashing during some DST transitions caused a segmentation fault (:issue:`33931`) +- Bug in :class:`Timestamp` hashing during some DST transitions caused a segmentation fault (:issue:`33931` and :issue:`40817`) - Timedelta diff --git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py index 4639a24d019fe..38ec4d0710231 100644 --- a/pandas/tests/scalar/timestamp/test_timestamp.py +++ b/pandas/tests/scalar/timestamp/test_timestamp.py @@ -25,6 +25,7 @@ import pandas.util._test_decorators as td from pandas import ( + DataFrame, NaT, Timedelta, Timestamp, @@ -479,6 +480,39 @@ def test_hash_timestamp_with_fold(self, timezone, year, month, day, hour): ) assert hash(transition_1) == hash(transition_2) + @pytest.mark.parametrize( + "timezone, year, month, day, hour", + [["America/Chicago", 2013, 11, 3, 1], ["America/Santiago", 2021, 4, 3, 23]], + ) + def test_reindex_timestamp_with_fold(self, timezone, year, month, day, hour): + # see gh-40817 + test_timezone = gettz(timezone) + transition_1 = Timestamp( + year=year, + month=month, + day=day, + hour=hour, + minute=0, + fold=0, + tzinfo=test_timezone, + ) + transition_2 = Timestamp( + year=year, + month=month, + day=day, + hour=hour, + minute=0, + fold=1, + tzinfo=test_timezone, + ) + df = ( + DataFrame({"index": [transition_1, transition_2], "vals": ["a", "b"]}) + .set_index("index") + .reindex(["1", "2"]) + ) + assert df.index.values[0] == "1" + assert df.index.values[1] == "2" + def test_tz_conversion_freq(self, tz_naive_fixture): # GH25241 with tm.assert_produces_warning(FutureWarning, match="freq"): From 2bce9eadd170b1cb09cfb14f011974bd86577b0c Mon Sep 17 00:00:00 2001 From: JonWiggins Date: Fri, 31 Dec 2021 15:23:54 -0600 Subject: [PATCH 6/7] Change test to use assert_frame_equal --- pandas/tests/scalar/timestamp/test_timestamp.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py index 38ec4d0710231..0e3d03ff83c12 100644 --- a/pandas/tests/scalar/timestamp/test_timestamp.py +++ b/pandas/tests/scalar/timestamp/test_timestamp.py @@ -510,8 +510,10 @@ def test_reindex_timestamp_with_fold(self, timezone, year, month, day, hour): .set_index("index") .reindex(["1", "2"]) ) - assert df.index.values[0] == "1" - assert df.index.values[1] == "2" + tm.assert_frame_equal( + df, + DataFrame({"index": ["1", "2"], "vals": [None, None]}).set_index("index"), + ) def test_tz_conversion_freq(self, tz_naive_fixture): # GH25241 From c0c160b93dc684f93e489d178f114ebe0497230d Mon Sep 17 00:00:00 2001 From: JonWiggins Date: Fri, 31 Dec 2021 20:47:06 -0600 Subject: [PATCH 7/7] Move folded reindex test to test_reindex --- pandas/tests/frame/methods/test_reindex.py | 37 +++++++++++++++++++ .../tests/scalar/timestamp/test_timestamp.py | 36 ------------------ 2 files changed, 37 insertions(+), 36 deletions(-) diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py index bee8025275b42..b70ceea845ee8 100644 --- a/pandas/tests/frame/methods/test_reindex.py +++ b/pandas/tests/frame/methods/test_reindex.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas._libs.tslibs.timezones import dateutil_gettz as gettz + import pandas as pd from pandas import ( Categorical, @@ -78,6 +80,41 @@ def test_setitem_reset_index_dtypes(self): result = df2.reset_index() tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( + "timezone, year, month, day, hour", + [["America/Chicago", 2013, 11, 3, 1], ["America/Santiago", 2021, 4, 3, 23]], + ) + def test_reindex_timestamp_with_fold(self, timezone, year, month, day, hour): + # see gh-40817 + test_timezone = gettz(timezone) + transition_1 = pd.Timestamp( + year=year, + month=month, + day=day, + hour=hour, + minute=0, + fold=0, + tzinfo=test_timezone, + ) + transition_2 = pd.Timestamp( + year=year, + month=month, + day=day, + hour=hour, + minute=0, + fold=1, + tzinfo=test_timezone, + ) + df = ( + DataFrame({"index": [transition_1, transition_2], "vals": ["a", "b"]}) + .set_index("index") + .reindex(["1", "2"]) + ) + tm.assert_frame_equal( + df, + DataFrame({"index": ["1", "2"], "vals": [None, None]}).set_index("index"), + ) + class TestDataFrameSelectReindex: # These are specific reindex-based tests; other indexing tests should go in diff --git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py index 0e3d03ff83c12..4639a24d019fe 100644 --- a/pandas/tests/scalar/timestamp/test_timestamp.py +++ b/pandas/tests/scalar/timestamp/test_timestamp.py @@ -25,7 +25,6 @@ import pandas.util._test_decorators as td from pandas import ( - DataFrame, NaT, Timedelta, Timestamp, @@ -480,41 +479,6 @@ def test_hash_timestamp_with_fold(self, timezone, year, month, day, hour): ) assert hash(transition_1) == hash(transition_2) - @pytest.mark.parametrize( - "timezone, year, month, day, hour", - [["America/Chicago", 2013, 11, 3, 1], ["America/Santiago", 2021, 4, 3, 23]], - ) - def test_reindex_timestamp_with_fold(self, timezone, year, month, day, hour): - # see gh-40817 - test_timezone = gettz(timezone) - transition_1 = Timestamp( - year=year, - month=month, - day=day, - hour=hour, - minute=0, - fold=0, - tzinfo=test_timezone, - ) - transition_2 = Timestamp( - year=year, - month=month, - day=day, - hour=hour, - minute=0, - fold=1, - tzinfo=test_timezone, - ) - df = ( - DataFrame({"index": [transition_1, transition_2], "vals": ["a", "b"]}) - .set_index("index") - .reindex(["1", "2"]) - ) - tm.assert_frame_equal( - df, - DataFrame({"index": ["1", "2"], "vals": [None, None]}).set_index("index"), - ) - def test_tz_conversion_freq(self, tz_naive_fixture): # GH25241 with tm.assert_produces_warning(FutureWarning, match="freq"):