From cc5b1787aead7f742ee18beb1931a25ca05bd310 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 27 Jan 2022 11:10:57 -0600 Subject: [PATCH 1/3] fix!: use `pandas.NaT` for missing values in dbdate and dbtime dtypes This makes them consistent with other date/time dtypes, as well as internally consistent with the advertised `dtype.na_value`. BREAKING-CHANGE: dbdate and dbtime dtypes return NaT instead of None for missing values Release-As: 0.4.0 --- db_dtypes/__init__.py | 6 +-- db_dtypes/core.py | 5 +- tests/unit/test_date.py | 27 ++++++++++ tests/unit/test_dtypes.py | 104 +++++++++++++++++++++----------------- tests/unit/test_time.py | 30 +++++++++++ 5 files changed, 119 insertions(+), 53 deletions(-) diff --git a/db_dtypes/__init__.py b/db_dtypes/__init__.py index a518a0b..1a250b0 100644 --- a/db_dtypes/__init__.py +++ b/db_dtypes/__init__.py @@ -145,8 +145,8 @@ def _datetime( raise TypeError("Invalid value type", scalar) def _box_func(self, x): - if pandas.isnull(x): - return None + if pandas.isna(x): + return pandas.NaT try: return x.astype("= (1, 2): - assert empty.median() is None + assert empty.median() is pd.NaT empty = cls([None]) - assert empty.min() is None - assert empty.max() is None - assert empty.min(skipna=False) is None - assert empty.max(skipna=False) is None + assert empty.min() is pd.NaT + assert empty.max() is pd.NaT + assert empty.min(skipna=False) is pd.NaT + assert empty.max(skipna=False) is pd.NaT if pandas_release >= (1, 2): with pytest.warns(RuntimeWarning, match="empty slice"): # It's weird that we get the warning here, and not # below. :/ - assert empty.median() is None - assert empty.median(skipna=False) is None + assert empty.median() is pd.NaT + assert empty.median(skipna=False) is pd.NaT a = _make_one(dtype) assert a.min() == sample_values[0] @@ -563,14 +573,14 @@ def test_date_add(): times = _cls("dbtime")(SAMPLE_VALUES["dbtime"]) expect = dates.astype("datetime64") + times.astype("timedelta64") - assert np.array_equal(dates + times, expect) - assert np.array_equal(times + dates, expect) + np.testing.assert_array_equal(dates + times, expect) + np.testing.assert_array_equal(times + dates, expect) do = pd.DateOffset(days=1) expect = dates.astype("object") + do - assert np.array_equal(dates + do, expect) + np.testing.assert_array_equal(dates + do, expect) if pandas_release >= (1, 1): - assert np.array_equal(do + dates, expect) + np.testing.assert_array_equal(do + dates, expect) with pytest.raises(TypeError): dates + times.astype("timedelta64") @@ -587,8 +597,8 @@ def test_date_add(): do = pd.Series([pd.DateOffset(days=i) for i in range(4)]) expect = dates.astype("object") + do - assert np.array_equal(dates + do, expect) - assert np.array_equal(do + dates, expect) + np.testing.assert_array_equal(dates + do, expect) + np.testing.assert_array_equal(do + dates, expect) def test_date_sub(): @@ -602,11 +612,11 @@ def test_date_sub(): ) ) expect = dates.astype("datetime64") - dates2.astype("datetime64") - assert np.array_equal(dates - dates2, expect) + np.testing.assert_array_equal(dates - dates2, expect) do = pd.DateOffset(days=1) expect = dates.astype("object") - do - assert np.array_equal(dates - do, expect) + np.testing.assert_array_equal(dates - do, expect) with pytest.raises(TypeError): dates - 42 @@ -620,4 +630,4 @@ def test_date_sub(): do = pd.Series([pd.DateOffset(days=i) for i in range(4)]) expect = dates.astype("object") - do - assert np.array_equal(dates - do, expect) + np.testing.assert_array_equal(dates - do, expect) diff --git a/tests/unit/test_time.py b/tests/unit/test_time.py index ba45949..8ecb996 100644 --- a/tests/unit/test_time.py +++ b/tests/unit/test_time.py @@ -19,6 +19,7 @@ # To register the types. import db_dtypes # noqa +from db_dtypes import pandas_backports @pytest.mark.parametrize( @@ -82,3 +83,32 @@ def test_time_parsing(value, expected): def test_time_parsing_errors(value, error): with pytest.raises(ValueError, match=error): pandas.Series([value], dtype="dbtime") + + +@pytest.mark.skipif( + not hasattr(pandas_backports, "numpy_validate_median"), + reason="median not available with this version of pandas", +) +@pytest.mark.parametrize( + "values, expected", + [ + ( + ["00:00:00", "12:34:56.789101", "23:59:59.999999"], + datetime.time(12, 34, 56, 789101), + ), + ( + [ + None, + "06:30:00", + pandas.NA if hasattr(pandas, "NA") else None, + pandas.NaT, + float("nan"), + ], + datetime.time(6, 30), + ), + (["2:22:21.222222", "2:22:23.222222"], datetime.time(2, 22, 22, 222222)), + ], +) +def test_date_median(values, expected): + series = pandas.Series(values, dtype="dbtime") + assert series.median() == expected From 39c4a67d16664deb6b1e707e1d0b9a339aa59f05 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Wed, 2 Feb 2022 16:09:16 -0600 Subject: [PATCH 2/3] adjust pandas version support for median --- db_dtypes/pandas_backports.py | 2 +- testing/constraints-3.9.txt | 3 ++- tests/unit/test_dtypes.py | 8 ++++---- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/db_dtypes/pandas_backports.py b/db_dtypes/pandas_backports.py index 4b733cc..f53adff 100644 --- a/db_dtypes/pandas_backports.py +++ b/db_dtypes/pandas_backports.py @@ -42,7 +42,7 @@ numpy_validate_max = pandas.compat.numpy.function.validate_max numpy_validate_min = pandas.compat.numpy.function.validate_min -if pandas_release >= (1, 2): +if pandas_release >= (1, 3): nanmedian = pandas.core.nanops.nanmedian numpy_validate_median = pandas.compat.numpy.function.validate_median diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index eebb9da..d814dcd 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -1 +1,2 @@ -sqlalchemy>=1.4.13 +# Make sure we test with pandas 1.3.0. The Python version isn't that relevant. +pandas==1.3.0 diff --git a/tests/unit/test_dtypes.py b/tests/unit/test_dtypes.py index f65f16d..66074d8 100644 --- a/tests/unit/test_dtypes.py +++ b/tests/unit/test_dtypes.py @@ -533,7 +533,7 @@ def test_min_max_median(dtype): a = cls(data) assert a.min() == sample_values[0] assert a.max() == sample_values[-1] - if pandas_release >= (1, 2): + if pandas_release >= (1, 3): assert ( a.median() == datetime.time(1, 2, 4) if dtype == "dbtime" @@ -543,14 +543,14 @@ def test_min_max_median(dtype): empty = cls([]) assert empty.min() is pd.NaT assert empty.max() is pd.NaT - if pandas_release >= (1, 2): + if pandas_release >= (1, 3): assert empty.median() is pd.NaT empty = cls([None]) assert empty.min() is pd.NaT assert empty.max() is pd.NaT assert empty.min(skipna=False) is pd.NaT assert empty.max(skipna=False) is pd.NaT - if pandas_release >= (1, 2): + if pandas_release >= (1, 3): with pytest.warns(RuntimeWarning, match="empty slice"): # It's weird that we get the warning here, and not # below. :/ @@ -560,7 +560,7 @@ def test_min_max_median(dtype): a = _make_one(dtype) assert a.min() == sample_values[0] assert a.max() == sample_values[1] - if pandas_release >= (1, 2): + if pandas_release >= (1, 3): assert ( a.median() == datetime.time(1, 2, 2, 750000) if dtype == "dbtime" From 6bb1c4e558266f5d4108e1698de9ddd6b1d56950 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Wed, 2 Feb 2022 16:10:49 -0600 Subject: [PATCH 3/3] consistent use of isna --- db_dtypes/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/db_dtypes/__init__.py b/db_dtypes/__init__.py index 1a250b0..a222e6d 100644 --- a/db_dtypes/__init__.py +++ b/db_dtypes/__init__.py @@ -250,7 +250,7 @@ def _datetime( raise TypeError("Invalid value type", scalar) def _box_func(self, x): - if pandas.isnull(x): + if pandas.isna(x): return pandas.NaT try: return x.astype("