From cc5b1787aead7f742ee18beb1931a25ca05bd310 Mon Sep 17 00:00:00 2001
From: Tim Swast <swast@google.com>
Date: Thu, 27 Jan 2022 11:10:57 -0600
Subject: [PATCH 1/3] fix!: use `pandas.NaT` for missing values in dbdate and
 dbtime dtypes

This makes them consistent with other date/time dtypes, as well as internally
consistent with the advertised `dtype.na_value`.

BREAKING-CHANGE: dbdate and dbtime dtypes return NaT instead of None for missing values

Release-As: 0.4.0
---
 db_dtypes/__init__.py     |   6 +--
 db_dtypes/core.py         |   5 +-
 tests/unit/test_date.py   |  27 ++++++++++
 tests/unit/test_dtypes.py | 104 +++++++++++++++++++++-----------------
 tests/unit/test_time.py   |  30 +++++++++++
 5 files changed, 119 insertions(+), 53 deletions(-)

diff --git a/db_dtypes/__init__.py b/db_dtypes/__init__.py
index a518a0b..1a250b0 100644
--- a/db_dtypes/__init__.py
+++ b/db_dtypes/__init__.py
@@ -145,8 +145,8 @@ def _datetime(
             raise TypeError("Invalid value type", scalar)
 
     def _box_func(self, x):
-        if pandas.isnull(x):
-            return None
+        if pandas.isna(x):
+            return pandas.NaT
 
         try:
             return x.astype("<M8[us]").astype(datetime.datetime).time()
@@ -251,7 +251,7 @@ def _datetime(
 
     def _box_func(self, x):
         if pandas.isnull(x):
-            return None
+            return pandas.NaT
         try:
             return x.astype("<M8[us]").astype(datetime.datetime).date()
         except AttributeError:
diff --git a/db_dtypes/core.py b/db_dtypes/core.py
index 05daf37..a06c6d6 100644
--- a/db_dtypes/core.py
+++ b/db_dtypes/core.py
@@ -16,7 +16,6 @@
 
 import numpy
 import pandas
-from pandas import NaT
 import pandas.api.extensions
 from pandas.api.types import is_dtype_equal, is_list_like, pandas_dtype
 
@@ -27,8 +26,8 @@
 
 
 class BaseDatetimeDtype(pandas.api.extensions.ExtensionDtype):
-    na_value = NaT
-    kind = "o"
+    na_value = pandas.NaT
+    kind = "O"
     names = None
 
     @classmethod
diff --git a/tests/unit/test_date.py b/tests/unit/test_date.py
index b906f24..bf877ea 100644
--- a/tests/unit/test_date.py
+++ b/tests/unit/test_date.py
@@ -19,6 +19,7 @@
 
 # To register the types.
 import db_dtypes  # noqa
+from db_dtypes import pandas_backports
 
 
 @pytest.mark.parametrize(
@@ -65,3 +66,29 @@ def test_date_parsing(value, expected):
 def test_date_parsing_errors(value, error):
     with pytest.raises(ValueError, match=error):
         pandas.Series([value], dtype="dbdate")
+
+
+@pytest.mark.skipif(  # skip when pandas_backports exposes no median validator
+    not hasattr(pandas_backports, "numpy_validate_median"),
+    reason="median not available with this version of pandas",
+)
+@pytest.mark.parametrize(
+    "values, expected",
+    [
+        (["1970-01-01", "1900-01-01", "2000-01-01"], datetime.date(1970, 1, 1)),
+        (
+            [  # missing-value spellings; median is expected to skip them
+                None,
+                "1900-01-01",
+                pandas.NA if hasattr(pandas, "NA") else None,  # NA may not exist
+                pandas.NaT,
+                float("nan"),
+            ],
+            datetime.date(1900, 1, 1),
+        ),
+        (["2222-02-01", "2222-02-03"], datetime.date(2222, 2, 2)),  # midpoint
+    ],
+)
+def test_date_median(values, expected):
+    series = pandas.Series(values, dtype="dbdate")
+    assert series.median() == expected  # median comes back as a date-comparable value
diff --git a/tests/unit/test_dtypes.py b/tests/unit/test_dtypes.py
index aacbf0b..f65f16d 100644
--- a/tests/unit/test_dtypes.py
+++ b/tests/unit/test_dtypes.py
@@ -23,8 +23,8 @@
 pandas_release = packaging.version.parse(pd.__version__).release
 
 SAMPLE_RAW_VALUES = dict(
-    dbdate=(datetime.date(2021, 2, 2), "2021-2-3", None),
-    dbtime=(datetime.time(1, 2, 2), "1:2:3.5", None),
+    dbdate=(datetime.date(2021, 2, 2), "2021-2-3", pd.NaT),
+    dbtime=(datetime.time(1, 2, 2), "1:2:3.5", pd.NaT),
 )
 SAMPLE_VALUES = dict(
     dbdate=(
@@ -90,7 +90,7 @@ def test_array_construction(dtype, factory_method):
         factory = getattr(factory, factory_method)
         if factory_method == "_from_sequence_of_strings":
             sample_raw_values = [
-                str(v) if v is not None else v for v in sample_raw_values
+                str(v) if not pd.isna(v) else v for v in sample_raw_values
             ]
     a = factory(sample_raw_values)
     assert len(a) == 3
@@ -98,11 +98,11 @@ def test_array_construction(dtype, factory_method):
     assert a.shape == (3,)
     sample_values = SAMPLE_VALUES[dtype]
     assert a[0], a[1] == sample_values[:2]
-    assert a[2] is None
+    assert pd.isna(a[2]) and a[2] is pd.NaT
 
     # implementation details:
     assert a.nbytes == 24
-    assert np.array_equal(
+    np.testing.assert_array_equal(
         a._ndarray
         == np.array(SAMPLE_DT_VALUES[dtype][:2] + ("NaT",), dtype="datetime64[us]"),
         [True, True, False],
@@ -121,7 +121,7 @@ def test_time_series_construction(dtype):
     s = pd.Series(SAMPLE_RAW_VALUES[dtype], dtype=dtype)
     assert len(s) == 3
     assert s[0], s[1] == sample_values[:2]
-    assert s[2] is None
+    assert s[2] is pd.NaT
     assert s.nbytes == 24
     assert isinstance(s.array, _cls(dtype))
 
@@ -166,8 +166,8 @@ def test_timearray_comparisons(
         # Note that the right_obs comparisons work because
         # they're called on right_obs rather then left, because
         # TimeArrays only support comparisons with TimeArrays.
-        assert np.array_equal(comparisons[op](left, r), expected)
-        assert np.array_equal(complements[op](left, r), ~expected)
+        np.testing.assert_array_equal(comparisons[op](left, r), expected)
+        np.testing.assert_array_equal(complements[op](left, r), ~expected)
 
     # Bad shape
     for bad_shape in ([], [1, 2, 3]):
@@ -186,10 +186,10 @@ def test_timearray_comparisons(
         [1],  # a single-element array gets broadcast
     ):
         if op == "==":
-            assert np.array_equal(
+            np.testing.assert_array_equal(
                 comparisons[op](left, np.array(bad_items)), np.array([False, False])
             )
-            assert np.array_equal(
+            np.testing.assert_array_equal(
                 complements[op](left, np.array(bad_items)), np.array([True, True])
             )
         else:
@@ -204,7 +204,7 @@ def test_timearray_comparisons(
 def test___getitem___arrayindex(dtype):
     cls = _cls(dtype)
     sample_values = SAMPLE_VALUES[dtype]
-    assert np.array_equal(
+    np.testing.assert_array_equal(
         cls(sample_values)[[1, 3]], cls([sample_values[1], sample_values[3]]),
     )
 
@@ -215,21 +215,23 @@ def test_timearray_slicing(dtype):
     b = a[:]
     assert b is not a
     assert b.__class__ == a.__class__
-    assert np.array_equal(b, a)
+    np.testing.assert_array_equal(b._ndarray, a._ndarray)
 
     sample_values = SAMPLE_VALUES[dtype]
     cls = _cls(dtype)
-    assert np.array_equal(a[:1], cls._from_sequence(sample_values[:1]))
+    np.testing.assert_array_equal(
+        a[:1]._ndarray, cls._from_sequence(sample_values[:1])._ndarray
+    )
 
     # Assignment works:
     a[:1] = cls._from_sequence([sample_values[2]])
-    assert np.array_equal(
+    np.testing.assert_array_equal(
         a[:2], cls._from_sequence([sample_values[2], sample_values[1]])
     )
 
     # Series also work:
     s = pd.Series(SAMPLE_RAW_VALUES[dtype], dtype=dtype)
-    assert np.array_equal(s[:1].array, cls._from_sequence([sample_values[0]]))
+    np.testing.assert_array_equal(s[:1].array, cls._from_sequence([sample_values[0]]))
 
 
 @for_date_and_time
@@ -238,9 +240,13 @@ def test_item_assignment(dtype):
     sample_values = SAMPLE_VALUES[dtype]
     cls = _cls(dtype)
     a[0] = sample_values[2]
-    assert np.array_equal(a, cls._from_sequence([sample_values[2], sample_values[1]]))
+    np.testing.assert_array_equal(
+        a, cls._from_sequence([sample_values[2], sample_values[1]])
+    )
     a[1] = None
-    assert np.array_equal(a, cls._from_sequence([sample_values[2], None]))
+    np.testing.assert_array_equal(
+        a._ndarray, cls._from_sequence([sample_values[2], None])._ndarray
+    )
 
 
 @for_date_and_time
@@ -249,9 +255,9 @@ def test_array_assignment(dtype):
     cls = _cls(dtype)
     sample_values = SAMPLE_VALUES[dtype]
     a[a.isna()] = sample_values[3]
-    assert np.array_equal(a, cls([sample_values[i] for i in (0, 1, 3)]))
+    np.testing.assert_array_equal(a, cls([sample_values[i] for i in (0, 1, 3)]))
     a[[0, 2]] = sample_values[2]
-    assert np.array_equal(a, cls([sample_values[i] for i in (2, 1, 2)]))
+    np.testing.assert_array_equal(a, cls([sample_values[i] for i in (2, 1, 2)]))
 
 
 @for_date_and_time
@@ -270,7 +276,7 @@ def test_copy(dtype):
     b = a.copy()
     assert b is not a
     assert b._ndarray is not a._ndarray
-    assert np.array_equal(b, a)
+    np.testing.assert_array_equal(b, a)
 
 
 @for_date_and_time
@@ -280,7 +286,7 @@ def test_from_ndarray_copy(dtype):
     a = cls._from_sequence(sample_values)
     b = cls(a._ndarray, copy=True)
     assert b._ndarray is not a._ndarray
-    assert np.array_equal(b, a)
+    np.testing.assert_array_equal(b, a)
 
 
 @for_date_and_time
@@ -310,7 +316,7 @@ def test__validate_scalar_invalid(dtype):
     [
         (False, None),
         (True, None),
-        (True, pd._libs.NaT if pd else None),
+        (True, pd.NaT if pd else None),
         (True, np.NaN if pd else None),
         (True, 42),
     ],
@@ -326,7 +332,7 @@ def test_take(dtype, allow_fill, fill_value):
                 else datetime.time(0, 42, 42, 424242)
             )
         else:
-            expected_fill = None
+            expected_fill = pd.NaT
         b = a.take([1, -1, 3], allow_fill=True, fill_value=fill_value)
         expect = [sample_values[1], expected_fill, sample_values[3]]
     else:
@@ -370,7 +376,7 @@ def test__concat_same_type_not_same_type(dtype):
 
 @for_date_and_time
 def test_dropna(dtype):
-    assert np.array_equal(_make_one(dtype).dropna(), _make_one(dtype)[:2])
+    np.testing.assert_array_equal(_make_one(dtype).dropna(), _make_one(dtype)[:2])
 
 
 @pytest.mark.parametrize(
@@ -398,14 +404,18 @@ def test_fillna(dtype, value, meth, limit, expect):
     elif value is not None:
         value = sample_values[value]
     expect = cls([None if i is None else sample_values[i] for i in expect])
-    assert np.array_equal(a.fillna(value, meth, limit), expect)
+    np.testing.assert_array_equal(
+        a.fillna(value, meth, limit)._ndarray, expect._ndarray
+    )
 
 
 @for_date_and_time
 def test_unique(dtype):
     cls = _cls(dtype)
     sample_values = SAMPLE_VALUES[dtype]
-    assert np.array_equal(cls(sample_values * 3).unique(), cls(sample_values),)
+    np.testing.assert_array_equal(
+        cls(sample_values * 3).unique(), cls(sample_values),
+    )
 
 
 @for_date_and_time
@@ -421,7 +431,7 @@ def test_astype_copy(dtype):
     b = a.astype(a.dtype, copy=True)
     assert b is not a
     assert b.__class__ is a.__class__
-    assert np.array_equal(b, a)
+    np.testing.assert_array_equal(b._ndarray, a._ndarray)
 
 
 @pytest.mark.parametrize(
@@ -452,7 +462,7 @@ def test_asdatetime(dtype, same):
 
         b = a.astype(dt, copy=copy)
         assert b is not a._ndarray
-        assert np.array_equal(b[:2], a._ndarray[:2])
+        np.testing.assert_array_equal(b[:2], a._ndarray[:2])
         assert pd.isna(b[2]) and str(b[2]) == "NaT"
 
 
@@ -482,7 +492,7 @@ def test_astimedelta(dtype):
 
     a = _cls("dbtime")([t, None])
     b = a.astype(dtype)
-    np.array_equal(b[:1], expect)
+    np.testing.assert_array_equal(b[:1], expect)
     assert pd.isna(b[1]) and str(b[1]) == "NaT"
 
 
@@ -531,21 +541,21 @@ def test_min_max_median(dtype):
             )
 
     empty = cls([])
-    assert empty.min() is None
-    assert empty.max() is None
+    assert empty.min() is pd.NaT
+    assert empty.max() is pd.NaT
     if pandas_release >= (1, 2):
-        assert empty.median() is None
+        assert empty.median() is pd.NaT
     empty = cls([None])
-    assert empty.min() is None
-    assert empty.max() is None
-    assert empty.min(skipna=False) is None
-    assert empty.max(skipna=False) is None
+    assert empty.min() is pd.NaT
+    assert empty.max() is pd.NaT
+    assert empty.min(skipna=False) is pd.NaT
+    assert empty.max(skipna=False) is pd.NaT
     if pandas_release >= (1, 2):
         with pytest.warns(RuntimeWarning, match="empty slice"):
             # It's weird that we get the warning here, and not
             # below. :/
-            assert empty.median() is None
-        assert empty.median(skipna=False) is None
+            assert empty.median() is pd.NaT
+        assert empty.median(skipna=False) is pd.NaT
 
     a = _make_one(dtype)
     assert a.min() == sample_values[0]
@@ -563,14 +573,14 @@ def test_date_add():
     times = _cls("dbtime")(SAMPLE_VALUES["dbtime"])
     expect = dates.astype("datetime64") + times.astype("timedelta64")
 
-    assert np.array_equal(dates + times, expect)
-    assert np.array_equal(times + dates, expect)
+    np.testing.assert_array_equal(dates + times, expect)
+    np.testing.assert_array_equal(times + dates, expect)
 
     do = pd.DateOffset(days=1)
     expect = dates.astype("object") + do
-    assert np.array_equal(dates + do, expect)
+    np.testing.assert_array_equal(dates + do, expect)
     if pandas_release >= (1, 1):
-        assert np.array_equal(do + dates, expect)
+        np.testing.assert_array_equal(do + dates, expect)
 
     with pytest.raises(TypeError):
         dates + times.astype("timedelta64")
@@ -587,8 +597,8 @@ def test_date_add():
 
     do = pd.Series([pd.DateOffset(days=i) for i in range(4)])
     expect = dates.astype("object") + do
-    assert np.array_equal(dates + do, expect)
-    assert np.array_equal(do + dates, expect)
+    np.testing.assert_array_equal(dates + do, expect)
+    np.testing.assert_array_equal(do + dates, expect)
 
 
 def test_date_sub():
@@ -602,11 +612,11 @@ def test_date_sub():
         )
     )
     expect = dates.astype("datetime64") - dates2.astype("datetime64")
-    assert np.array_equal(dates - dates2, expect)
+    np.testing.assert_array_equal(dates - dates2, expect)
 
     do = pd.DateOffset(days=1)
     expect = dates.astype("object") - do
-    assert np.array_equal(dates - do, expect)
+    np.testing.assert_array_equal(dates - do, expect)
 
     with pytest.raises(TypeError):
         dates - 42
@@ -620,4 +630,4 @@ def test_date_sub():
 
     do = pd.Series([pd.DateOffset(days=i) for i in range(4)])
     expect = dates.astype("object") - do
-    assert np.array_equal(dates - do, expect)
+    np.testing.assert_array_equal(dates - do, expect)
diff --git a/tests/unit/test_time.py b/tests/unit/test_time.py
index ba45949..8ecb996 100644
--- a/tests/unit/test_time.py
+++ b/tests/unit/test_time.py
@@ -19,6 +19,7 @@
 
 # To register the types.
 import db_dtypes  # noqa
+from db_dtypes import pandas_backports
 
 
 @pytest.mark.parametrize(
@@ -82,3 +83,32 @@ def test_time_parsing(value, expected):
 def test_time_parsing_errors(value, error):
     with pytest.raises(ValueError, match=error):
         pandas.Series([value], dtype="dbtime")
+
+
+@pytest.mark.skipif(  # skip when pandas_backports exposes no median validator
+    not hasattr(pandas_backports, "numpy_validate_median"),
+    reason="median not available with this version of pandas",
+)
+@pytest.mark.parametrize(
+    "values, expected",
+    [
+        (  # odd count: the middle value is returned with microsecond precision
+            ["00:00:00", "12:34:56.789101", "23:59:59.999999"],
+            datetime.time(12, 34, 56, 789101),
+        ),
+        (
+            [  # missing-value spellings; median is expected to skip them
+                None,
+                "06:30:00",
+                pandas.NA if hasattr(pandas, "NA") else None,  # NA may not exist
+                pandas.NaT,
+                float("nan"),
+            ],
+            datetime.time(6, 30),
+        ),
+        (["2:22:21.222222", "2:22:23.222222"], datetime.time(2, 22, 22, 222222)),
+    ],
+)
+def test_time_median(values, expected):
+    series = pandas.Series(values, dtype="dbtime")  # exercises the dbtime dtype
+    assert series.median() == expected

From 39c4a67d16664deb6b1e707e1d0b9a339aa59f05 Mon Sep 17 00:00:00 2001
From: Tim Swast <swast@google.com>
Date: Wed, 2 Feb 2022 16:09:16 -0600
Subject: [PATCH 2/3] require pandas >= 1.3 for median support

---
 db_dtypes/pandas_backports.py | 2 +-
 testing/constraints-3.9.txt   | 3 ++-
 tests/unit/test_dtypes.py     | 8 ++++----
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/db_dtypes/pandas_backports.py b/db_dtypes/pandas_backports.py
index 4b733cc..f53adff 100644
--- a/db_dtypes/pandas_backports.py
+++ b/db_dtypes/pandas_backports.py
@@ -42,7 +42,7 @@
 numpy_validate_max = pandas.compat.numpy.function.validate_max
 numpy_validate_min = pandas.compat.numpy.function.validate_min
 
-if pandas_release >= (1, 2):
+if pandas_release >= (1, 3):
     nanmedian = pandas.core.nanops.nanmedian
     numpy_validate_median = pandas.compat.numpy.function.validate_median
 
diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt
index eebb9da..d814dcd 100644
--- a/testing/constraints-3.9.txt
+++ b/testing/constraints-3.9.txt
@@ -1 +1,2 @@
-sqlalchemy>=1.4.13
+# Make sure we test with pandas 1.3.0. The Python version isn't that relevant.
+pandas==1.3.0
diff --git a/tests/unit/test_dtypes.py b/tests/unit/test_dtypes.py
index f65f16d..66074d8 100644
--- a/tests/unit/test_dtypes.py
+++ b/tests/unit/test_dtypes.py
@@ -533,7 +533,7 @@ def test_min_max_median(dtype):
         a = cls(data)
         assert a.min() == sample_values[0]
         assert a.max() == sample_values[-1]
-        if pandas_release >= (1, 2):
+        if pandas_release >= (1, 3):
             assert (
                 a.median() == datetime.time(1, 2, 4)
                 if dtype == "dbtime"
@@ -543,14 +543,14 @@ def test_min_max_median(dtype):
     empty = cls([])
     assert empty.min() is pd.NaT
     assert empty.max() is pd.NaT
-    if pandas_release >= (1, 2):
+    if pandas_release >= (1, 3):
         assert empty.median() is pd.NaT
     empty = cls([None])
     assert empty.min() is pd.NaT
     assert empty.max() is pd.NaT
     assert empty.min(skipna=False) is pd.NaT
     assert empty.max(skipna=False) is pd.NaT
-    if pandas_release >= (1, 2):
+    if pandas_release >= (1, 3):
         with pytest.warns(RuntimeWarning, match="empty slice"):
             # It's weird that we get the warning here, and not
             # below. :/
@@ -560,7 +560,7 @@ def test_min_max_median(dtype):
     a = _make_one(dtype)
     assert a.min() == sample_values[0]
     assert a.max() == sample_values[1]
-    if pandas_release >= (1, 2):
+    if pandas_release >= (1, 3):
         assert (
             a.median() == datetime.time(1, 2, 2, 750000)
             if dtype == "dbtime"

From 6bb1c4e558266f5d4108e1698de9ddd6b1d56950 Mon Sep 17 00:00:00 2001
From: Tim Swast <swast@google.com>
Date: Wed, 2 Feb 2022 16:10:49 -0600
Subject: [PATCH 3/3] use pandas.isna consistently in _box_func

---
 db_dtypes/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/db_dtypes/__init__.py b/db_dtypes/__init__.py
index 1a250b0..a222e6d 100644
--- a/db_dtypes/__init__.py
+++ b/db_dtypes/__init__.py
@@ -250,7 +250,7 @@ def _datetime(
             raise TypeError("Invalid value type", scalar)
 
     def _box_func(self, x):
-        if pandas.isnull(x):
+        if pandas.isna(x):
             return pandas.NaT
         try:
             return x.astype("<M8[us]").astype(datetime.datetime).date()