Skip to content

fix!: use pandas.NaT for missing values in dbdate and dbtime dtypes #67

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Feb 2, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions db_dtypes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,8 +145,8 @@ def _datetime(
raise TypeError("Invalid value type", scalar)

def _box_func(self, x):
if pandas.isnull(x):
return None
if pandas.isna(x):
return pandas.NaT

try:
return x.astype("<M8[us]").astype(datetime.datetime).time()
Expand Down Expand Up @@ -251,7 +251,7 @@ def _datetime(

def _box_func(self, x):
if pandas.isnull(x):
return None
return pandas.NaT
try:
return x.astype("<M8[us]").astype(datetime.datetime).date()
except AttributeError:
Expand Down
5 changes: 2 additions & 3 deletions db_dtypes/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@

import numpy
import pandas
from pandas import NaT
import pandas.api.extensions
from pandas.api.types import is_dtype_equal, is_list_like, pandas_dtype

Expand All @@ -27,8 +26,8 @@


class BaseDatetimeDtype(pandas.api.extensions.ExtensionDtype):
na_value = NaT
kind = "o"
na_value = pandas.NaT
kind = "O"
names = None

@classmethod
Expand Down
27 changes: 27 additions & 0 deletions tests/unit/test_date.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

# To register the types.
import db_dtypes # noqa
from db_dtypes import pandas_backports


@pytest.mark.parametrize(
Expand Down Expand Up @@ -65,3 +66,29 @@ def test_date_parsing(value, expected):
def test_date_parsing_errors(value, error):
with pytest.raises(ValueError, match=error):
pandas.Series([value], dtype="dbdate")


@pytest.mark.skipif(
    not hasattr(pandas_backports, "numpy_validate_median"),
    reason="median not available with this version of pandas",
)
@pytest.mark.parametrize(
    "values, expected",
    [
        # Three valid dates: the middle one is expected.
        (["1970-01-01", "1900-01-01", "2000-01-01"], datetime.date(1970, 1, 1)),
        # A single valid date surrounded by several spellings of "missing";
        # the expected median is that one date.
        (
            [
                None,
                "1900-01-01",
                # Falls back to None when this pandas has no ``NA`` attribute.
                getattr(pandas, "NA", None),
                pandas.NaT,
                float("nan"),
            ],
            datetime.date(1900, 1, 1),
        ),
        # Two valid dates: the expected median is the date halfway between.
        (["2222-02-01", "2222-02-03"], datetime.date(2222, 2, 2)),
    ],
)
def test_date_median(values, expected):
    """Check ``Series.median()`` on a ``dbdate`` series built from ``values``.

    The test is skipped when ``pandas_backports`` does not expose
    ``numpy_validate_median``.
    """
    median = pandas.Series(values, dtype="dbdate").median()
    assert median == expected
104 changes: 57 additions & 47 deletions tests/unit/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@
pandas_release = packaging.version.parse(pd.__version__).release

SAMPLE_RAW_VALUES = dict(
dbdate=(datetime.date(2021, 2, 2), "2021-2-3", None),
dbtime=(datetime.time(1, 2, 2), "1:2:3.5", None),
dbdate=(datetime.date(2021, 2, 2), "2021-2-3", pd.NaT),
dbtime=(datetime.time(1, 2, 2), "1:2:3.5", pd.NaT),
)
SAMPLE_VALUES = dict(
dbdate=(
Expand Down Expand Up @@ -90,19 +90,19 @@ def test_array_construction(dtype, factory_method):
factory = getattr(factory, factory_method)
if factory_method == "_from_sequence_of_strings":
sample_raw_values = [
str(v) if v is not None else v for v in sample_raw_values
str(v) if not pd.isna(v) else v for v in sample_raw_values
]
a = factory(sample_raw_values)
assert len(a) == 3
assert a.size == 3
assert a.shape == (3,)
sample_values = SAMPLE_VALUES[dtype]
assert a[0], a[1] == sample_values[:2]
assert a[2] is None
assert pd.isna(a[2]) and a[2] is pd.NaT

# implementation details:
assert a.nbytes == 24
assert np.array_equal(
np.testing.assert_array_equal(
a._ndarray
== np.array(SAMPLE_DT_VALUES[dtype][:2] + ("NaT",), dtype="datetime64[us]"),
[True, True, False],
Expand All @@ -121,7 +121,7 @@ def test_time_series_construction(dtype):
s = pd.Series(SAMPLE_RAW_VALUES[dtype], dtype=dtype)
assert len(s) == 3
assert s[0], s[1] == sample_values[:2]
assert s[2] is None
assert s[2] is pd.NaT
assert s.nbytes == 24
assert isinstance(s.array, _cls(dtype))

Expand Down Expand Up @@ -166,8 +166,8 @@ def test_timearray_comparisons(
# Note that the right_obs comparisons work because
# they're called on right_obs rather than left, because
# TimeArrays only support comparisons with TimeArrays.
assert np.array_equal(comparisons[op](left, r), expected)
assert np.array_equal(complements[op](left, r), ~expected)
np.testing.assert_array_equal(comparisons[op](left, r), expected)
np.testing.assert_array_equal(complements[op](left, r), ~expected)

# Bad shape
for bad_shape in ([], [1, 2, 3]):
Expand All @@ -186,10 +186,10 @@ def test_timearray_comparisons(
[1], # a single-element array gets broadcast
):
if op == "==":
assert np.array_equal(
np.testing.assert_array_equal(
comparisons[op](left, np.array(bad_items)), np.array([False, False])
)
assert np.array_equal(
np.testing.assert_array_equal(
complements[op](left, np.array(bad_items)), np.array([True, True])
)
else:
Expand All @@ -204,7 +204,7 @@ def test_timearray_comparisons(
def test___getitem___arrayindex(dtype):
cls = _cls(dtype)
sample_values = SAMPLE_VALUES[dtype]
assert np.array_equal(
np.testing.assert_array_equal(
cls(sample_values)[[1, 3]], cls([sample_values[1], sample_values[3]]),
)

Expand All @@ -215,21 +215,23 @@ def test_timearray_slicing(dtype):
b = a[:]
assert b is not a
assert b.__class__ == a.__class__
assert np.array_equal(b, a)
np.testing.assert_array_equal(b._ndarray, a._ndarray)

sample_values = SAMPLE_VALUES[dtype]
cls = _cls(dtype)
assert np.array_equal(a[:1], cls._from_sequence(sample_values[:1]))
np.testing.assert_array_equal(
a[:1]._ndarray, cls._from_sequence(sample_values[:1])._ndarray
)

# Assignment works:
a[:1] = cls._from_sequence([sample_values[2]])
assert np.array_equal(
np.testing.assert_array_equal(
a[:2], cls._from_sequence([sample_values[2], sample_values[1]])
)

# Series also work:
s = pd.Series(SAMPLE_RAW_VALUES[dtype], dtype=dtype)
assert np.array_equal(s[:1].array, cls._from_sequence([sample_values[0]]))
np.testing.assert_array_equal(s[:1].array, cls._from_sequence([sample_values[0]]))


@for_date_and_time
Expand All @@ -238,9 +240,13 @@ def test_item_assignment(dtype):
sample_values = SAMPLE_VALUES[dtype]
cls = _cls(dtype)
a[0] = sample_values[2]
assert np.array_equal(a, cls._from_sequence([sample_values[2], sample_values[1]]))
np.testing.assert_array_equal(
a, cls._from_sequence([sample_values[2], sample_values[1]])
)
a[1] = None
assert np.array_equal(a, cls._from_sequence([sample_values[2], None]))
np.testing.assert_array_equal(
a._ndarray, cls._from_sequence([sample_values[2], None])._ndarray
)


@for_date_and_time
Expand All @@ -249,9 +255,9 @@ def test_array_assignment(dtype):
cls = _cls(dtype)
sample_values = SAMPLE_VALUES[dtype]
a[a.isna()] = sample_values[3]
assert np.array_equal(a, cls([sample_values[i] for i in (0, 1, 3)]))
np.testing.assert_array_equal(a, cls([sample_values[i] for i in (0, 1, 3)]))
a[[0, 2]] = sample_values[2]
assert np.array_equal(a, cls([sample_values[i] for i in (2, 1, 2)]))
np.testing.assert_array_equal(a, cls([sample_values[i] for i in (2, 1, 2)]))


@for_date_and_time
Expand All @@ -270,7 +276,7 @@ def test_copy(dtype):
b = a.copy()
assert b is not a
assert b._ndarray is not a._ndarray
assert np.array_equal(b, a)
np.testing.assert_array_equal(b, a)


@for_date_and_time
Expand All @@ -280,7 +286,7 @@ def test_from_ndarray_copy(dtype):
a = cls._from_sequence(sample_values)
b = cls(a._ndarray, copy=True)
assert b._ndarray is not a._ndarray
assert np.array_equal(b, a)
np.testing.assert_array_equal(b, a)


@for_date_and_time
Expand Down Expand Up @@ -310,7 +316,7 @@ def test__validate_scalar_invalid(dtype):
[
(False, None),
(True, None),
(True, pd._libs.NaT if pd else None),
(True, pd.NaT if pd else None),
(True, np.NaN if pd else None),
(True, 42),
],
Expand All @@ -326,7 +332,7 @@ def test_take(dtype, allow_fill, fill_value):
else datetime.time(0, 42, 42, 424242)
)
else:
expected_fill = None
expected_fill = pd.NaT
b = a.take([1, -1, 3], allow_fill=True, fill_value=fill_value)
expect = [sample_values[1], expected_fill, sample_values[3]]
else:
Expand Down Expand Up @@ -370,7 +376,7 @@ def test__concat_same_type_not_same_type(dtype):

@for_date_and_time
def test_dropna(dtype):
assert np.array_equal(_make_one(dtype).dropna(), _make_one(dtype)[:2])
np.testing.assert_array_equal(_make_one(dtype).dropna(), _make_one(dtype)[:2])


@pytest.mark.parametrize(
Expand Down Expand Up @@ -398,14 +404,18 @@ def test_fillna(dtype, value, meth, limit, expect):
elif value is not None:
value = sample_values[value]
expect = cls([None if i is None else sample_values[i] for i in expect])
assert np.array_equal(a.fillna(value, meth, limit), expect)
np.testing.assert_array_equal(
a.fillna(value, meth, limit)._ndarray, expect._ndarray
)


@for_date_and_time
def test_unique(dtype):
cls = _cls(dtype)
sample_values = SAMPLE_VALUES[dtype]
assert np.array_equal(cls(sample_values * 3).unique(), cls(sample_values),)
np.testing.assert_array_equal(
cls(sample_values * 3).unique(), cls(sample_values),
)


@for_date_and_time
Expand All @@ -421,7 +431,7 @@ def test_astype_copy(dtype):
b = a.astype(a.dtype, copy=True)
assert b is not a
assert b.__class__ is a.__class__
assert np.array_equal(b, a)
np.testing.assert_array_equal(b._ndarray, a._ndarray)


@pytest.mark.parametrize(
Expand Down Expand Up @@ -452,7 +462,7 @@ def test_asdatetime(dtype, same):

b = a.astype(dt, copy=copy)
assert b is not a._ndarray
assert np.array_equal(b[:2], a._ndarray[:2])
np.testing.assert_array_equal(b[:2], a._ndarray[:2])
assert pd.isna(b[2]) and str(b[2]) == "NaT"


Expand Down Expand Up @@ -482,7 +492,7 @@ def test_astimedelta(dtype):

a = _cls("dbtime")([t, None])
b = a.astype(dtype)
np.array_equal(b[:1], expect)
np.testing.assert_array_equal(b[:1], expect)
assert pd.isna(b[1]) and str(b[1]) == "NaT"


Expand Down Expand Up @@ -531,21 +541,21 @@ def test_min_max_median(dtype):
)

empty = cls([])
assert empty.min() is None
assert empty.max() is None
assert empty.min() is pd.NaT
assert empty.max() is pd.NaT
if pandas_release >= (1, 2):
assert empty.median() is None
assert empty.median() is pd.NaT
empty = cls([None])
assert empty.min() is None
assert empty.max() is None
assert empty.min(skipna=False) is None
assert empty.max(skipna=False) is None
assert empty.min() is pd.NaT
assert empty.max() is pd.NaT
assert empty.min(skipna=False) is pd.NaT
assert empty.max(skipna=False) is pd.NaT
if pandas_release >= (1, 2):
with pytest.warns(RuntimeWarning, match="empty slice"):
# It's weird that we get the warning here, and not
# below. :/
assert empty.median() is None
assert empty.median(skipna=False) is None
assert empty.median() is pd.NaT
assert empty.median(skipna=False) is pd.NaT

a = _make_one(dtype)
assert a.min() == sample_values[0]
Expand All @@ -563,14 +573,14 @@ def test_date_add():
times = _cls("dbtime")(SAMPLE_VALUES["dbtime"])
expect = dates.astype("datetime64") + times.astype("timedelta64")

assert np.array_equal(dates + times, expect)
assert np.array_equal(times + dates, expect)
np.testing.assert_array_equal(dates + times, expect)
np.testing.assert_array_equal(times + dates, expect)

do = pd.DateOffset(days=1)
expect = dates.astype("object") + do
assert np.array_equal(dates + do, expect)
np.testing.assert_array_equal(dates + do, expect)
if pandas_release >= (1, 1):
assert np.array_equal(do + dates, expect)
np.testing.assert_array_equal(do + dates, expect)

with pytest.raises(TypeError):
dates + times.astype("timedelta64")
Expand All @@ -587,8 +597,8 @@ def test_date_add():

do = pd.Series([pd.DateOffset(days=i) for i in range(4)])
expect = dates.astype("object") + do
assert np.array_equal(dates + do, expect)
assert np.array_equal(do + dates, expect)
np.testing.assert_array_equal(dates + do, expect)
np.testing.assert_array_equal(do + dates, expect)


def test_date_sub():
Expand All @@ -602,11 +612,11 @@ def test_date_sub():
)
)
expect = dates.astype("datetime64") - dates2.astype("datetime64")
assert np.array_equal(dates - dates2, expect)
np.testing.assert_array_equal(dates - dates2, expect)

do = pd.DateOffset(days=1)
expect = dates.astype("object") - do
assert np.array_equal(dates - do, expect)
np.testing.assert_array_equal(dates - do, expect)

with pytest.raises(TypeError):
dates - 42
Expand All @@ -620,4 +630,4 @@ def test_date_sub():

do = pd.Series([pd.DateOffset(days=i) for i in range(4)])
expect = dates.astype("object") - do
assert np.array_equal(dates - do, expect)
np.testing.assert_array_equal(dates - do, expect)
Loading