Skip to content

[WIP] implement tests using hypothesis #18761

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,9 @@ dist
coverage.xml
coverage_html_report

# hypothesis test database
.hypothesis/

# OS generated files #
######################
.directory
Expand Down
334 changes: 334 additions & 0 deletions pandas/tests/tseries/offsets/test_behavior.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,334 @@
# -*- coding: utf-8 -*-
"""
Behavioral based tests for offsets and date_range.
"""
from datetime import timedelta

import pytest
from hypothesis import given, assume
import hypothesis.strategies as st
import hypothesis.extra.numpy as hen
import hypothesis.extra.pytz as hepytz # hypothesis[pytz]

import pandas as pd

from pandas.tseries.offsets import (Hour, Minute, Second, Milli, Micro, Nano,
MonthEnd, MonthBegin,
BMonthEnd, BMonthBegin,
QuarterEnd, QuarterBegin,
BQuarterEnd, BQuarterBegin,
YearEnd, YearBegin,
BYearEnd, BYearBegin,
Week, LastWeekOfMonth, WeekOfMonth,
SemiMonthBegin, SemiMonthEnd,
Easter,
FY5253, FY5253Quarter,
DateOffset)
# TODO:
# BusinessDay, BusinessHour, CustomBusinessDay, CustomBusinessHour,
# CustomBusinessMonthEnd, CustomBusinessMonthBegin


# Tick offsets; used to parametrize the Tick-specific behavior tests.
tick_classes = [Hour, Minute, Second, Milli, Micro, Nano]

# Month/Quarter/Year offsets (including the business variants); used to
# parametrize the apply_index and shift-across-DST tests.
yqm_classes = [
    MonthBegin, MonthEnd, BMonthBegin, BMonthEnd,
    QuarterBegin, QuarterEnd, BQuarterBegin, BQuarterEnd,
    YearBegin, YearEnd, BYearBegin, BYearEnd,
]

# Every offset class covered by gen_random_offset: the remaining classes
# first, followed by the tick classes and then the yqm classes.
offset_types = [
    Week, LastWeekOfMonth, WeekOfMonth, SemiMonthEnd, SemiMonthBegin,
    FY5253Quarter, FY5253, Easter, DateOffset,
]
offset_types.extend(tick_classes)
offset_types.extend(yqm_classes)

# ----------------------------------------------------------------
# Helpers for generating random data

# pd.Timestamp.max with its nanosecond field zeroed, as a stdlib datetime.
dt_max = pd.Timestamp.max.replace(nanosecond=0).to_pydatetime()

# Bounds handed to st.timedeltas in gen_timedeltalike; td_min lies one
# microsecond further from zero than td_max.
td_max = timedelta(days=106751, seconds=85636, microseconds=854775)
td_min = -(td_max + timedelta(microseconds=1))

# Integer strategies shared by the offset generators below.
# TODO: Choose the bounds for n systematically.  (-999, 999) is arbitrarily
# chosen to get rid of OverflowErrors in development
n_strategy = st.integers(-999, 999)
month_strategy = st.integers(1, 12)
weekday_strategy = st.integers(0, 6)


def gen_dst_crossing():
    """
    Placeholder for a generator of DST-crossing data.

    Intended to generate either a pair of Timestamps or a date_range that
    is known to cross a DST transition.

    Raises
    ------
    NotImplementedError
        Always; the generator has not been written yet.
    """
    raise NotImplementedError()


def gen_date_range_freq():
    """
    Return a strategy that samples a frequency string suitable for passing
    as the `freq` kwarg to date_range.
    """
    # TODO: Add the rest; business, multiples, ...
    freq_aliases = ['Y', 'Q', 'M', 'D', 'H',
                    'T', 's', 'ms', 'us', 'ns']
    return st.sampled_from(freq_aliases)


@st.composite
def gen_random_date_range(draw):
# TODO: Choose the min/max values more systematically
start = st.datetimes(min_value=pd.Timestamp(1900, 1, 1).to_pydatetime(),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not just datetime(1900, 1, 1)?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this was copy/pasted from somewhere else where it was originally pd.Timestamp.min; I had to adjust it to avoid OverflowErrors.

max_value=pd.Timestamp(2100, 1, 1).to_pydatetime())
periods = st.integers(min_value=10, max_value=100)
freq = gen_date_range_freq()
tz = gen_random_tz()

dti = pd.date_range(start=draw(start), tz=draw(tz),
freq=draw(freq), periods=draw(periods))
return dti


def gen_random_tz():
    """
    Return a strategy that yields either None or a timezone drawn from
    hypothesis.extra.pytz.
    """
    # TODO: Weighting between naive and timezones?
    # TODO: Get dateutil timezones?
    naive = st.none()
    pytz_zones = hepytz.timezones()
    return st.one_of(naive, pytz_zones)
Copy link
Contributor

@pganssle pganssle Dec 13, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have been briefly in contact with @DRMacIver about adding a hypothesis.extras.dateutil extra to do just this. It may require some additional public interface stuff on the dateutil side.

That said, it seems that all timezones provided by pytz map identically to dateutil zones, for all datetimes less than 2038-01-01. To prove it, here's a hypothesis test I ran:

from hypothesis import given, assume
from hypothesis import strategies as st
from hypothesis.extra import pytz as hepytz

from dateutil import tz
from datetime import datetime

@given(dt=st.datetimes(), tzi=hepytz.timezones())
def test_dateutil_compat(dt, tzi):
    tzi_du = tz.gettz(str(tzi))
    dt_pytz = tzi.localize(dt)
    dt_du = dt.replace(tzinfo=tzi_du)

    assume(dt < datetime(2038, 1, 1))
    assert dt_pytz == dt_du

So you should be able to get dateutil zones from tz.gettz(str(pytz_zone))

See dateutil/dateutil#590 for the reason why the assume(dt < datetime(2038, 1, 1)) is in there.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, does this need to be a function? Why not just assign timezone_strategy = st.one_of([None, hepytz.timezones()])?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, does this need to be a function?

Probably not. I'm still getting the hang of @composite, data, and draw.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So you should be able to get dateutil zones from tz.gettz(str(pytz_zone))

That looks great, thanks. I'll take a look at dateutil-590 to see if I can be helpful.



# Strategy for stdlib datetimes between the pd.Timestamp.min and
# pd.Timestamp.max bounds, with tzinfo drawn from gen_random_tz().
gen_random_datetime = st.datetimes(
    timezones=gen_random_tz(),
    min_value=pd.Timestamp.min.to_pydatetime(),
    max_value=pd.Timestamp.max.to_pydatetime(),
)


@st.composite
def gen_random_timestamp(draw):
    """
    Strategy producing a random pd.Timestamp with a nanosecond component.

    Previously this function passed hypothesis strategies straight to
    pd.Timestamp / Timestamp.replace and threw away the result of
    `ts.replace(...)`.  It is now a composite strategy, so the datetime and
    nanosecond values are drawn before use.

    Parameters
    ----------
    draw : callable
        Supplied by `st.composite`; callers invoke `gen_random_timestamp()`
        with no arguments.

    Returns
    -------
    pd.Timestamp
    """
    nano = draw(st.integers(min_value=0, max_value=999))
    dt = draw(st.datetimes(min_value=pd.Timestamp.min.to_pydatetime(),
                           max_value=pd.Timestamp.max.to_pydatetime(),
                           timezones=gen_random_tz()))
    # NOTE(review): to_pydatetime() drops any nanosecond part of
    # Timestamp.min, so the lower bound may fall marginally below
    # Timestamp.min -- confirm this cannot raise for the pandas version used.
    ts = pd.Timestamp(dt)

    if dt != dt_max:
        # Timestamp.replace returns a new object; keep the result.
        ts = ts.replace(nanosecond=nano)
    else:
        # At the very top of the range, cap the nanoseconds so the result
        # does not exceed pd.Timestamp.max.
        ts = ts.replace(nanosecond=min(nano, pd.Timestamp.max.nanosecond))

    # TODO: worry about timezones near min/max?
    return ts


def gen_random_datelike():
    """
    Return a strategy producing datetime-like scalars.

    Only the stdlib datetimes from `gen_random_datetime` are enabled at
    present.
    """
    # Candidate sources that are not enabled yet:
    #   py_dates = st.dates()
    #   dt64_dtypes = hen.datetime64_dtypes()
    #   np_dates = hen.arrays(dtype=dt64_dtypes, shape=())
    # TODO: Allow for non-scalar versions?
    # FIXME: dt64.__add__(offset) does not get dispatched to
    # offset.__radd__(dt64), just raises TypeError
    return st.one_of(gen_random_datetime)


def gen_timedeltalike():
    """
    Return a strategy producing timedelta-like scalars: stdlib timedeltas,
    pd.Timedelta objects, or shape-() numpy timedelta64 arrays.
    """
    # TODO: get those last few nanoseconds?
    stdlib_tds = st.timedeltas(min_value=td_min, max_value=td_max)
    pandas_tds = stdlib_tds.map(pd.Timedelta)

    # TODO: Allow for non-scalar versions?
    numpy_tds = hen.arrays(dtype=hen.timedelta64_dtypes(), shape=())

    # TODO: Week
    # TODO: Tick
    return st.one_of(stdlib_tds, pandas_tds, numpy_tds)


@st.composite
def gen_random_relativedelta_DateOffset(draw):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you need to add some assumptions here to make sure that your year, month, week, day, etc values are valid. You can do this by drawing them from integers with min/max set, drawing them as you are and then adding assume statements, or (and this might be a bit out there), draw a random datetime() and just use the values from that (won't work for week, but you can calculate the week from the datetime if you want to use it, I imagine).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh most definitely. I just threw those in and figured I'd put in the appropriate bounds one-by-one as errors came up.

relativedelta_kwds = set([
'years', 'months', 'weeks', 'days',
'year', 'month', 'week', 'day', 'weekday',
'hour', 'minute', 'second', 'microsecond',
'nanosecond', 'nanoseconds',
'hours', 'minutes', 'seconds', 'milliseconds', 'microseconds'])
kwargs = {kwd: st.integers() for kwd in relativedelta_kwds}
kwargs['n'] = st.integers()
kwargs['normalize'] = st.booleans()
kwargs = {key: draw(kwargs[key]) for key in kwargs}
return DateOffset(**kwargs)


@st.composite
def gen_random_offset(draw, cls):
    """
    Strategy producing a random instance of the offset class `cls`.

    Parameters
    ----------
    draw : callable
        Supplied by `st.composite`; callers pass only `cls`, i.e.
        `gen_random_offset(cls)`.
    cls : type
        The offset class to instantiate.

    Returns
    -------
    An instance of `cls` built from randomly drawn constructor arguments.

    Raises
    ------
    NotImplementedError
        If no argument strategy is defined for `cls`.
    """
    n = n_strategy
    normalize = st.booleans()

    if cls in tick_classes + [MonthBegin, MonthEnd, BMonthBegin, BMonthEnd,
                              Easter]:
        n = n.filter(lambda x: abs(x) < 100)  # TODO: avoid arbitrary cutoff
        tup = st.tuples(n, normalize)

    elif cls in [QuarterBegin, QuarterEnd, BQuarterBegin, BQuarterEnd]:
        n = n.filter(lambda x: abs(x) < 25)  # TODO: avoid arbitrary cutoff
        startingMonth = month_strategy
        tup = st.tuples(n, normalize, startingMonth)

    elif cls in [YearBegin, YearEnd, BYearBegin, BYearEnd]:
        n = n.filter(lambda x: abs(x) < 6)  # TODO: avoid arbitrary cutoff
        month = month_strategy
        tup = st.tuples(n, normalize, month)

    elif cls == Week:
        n = n.filter(lambda x: abs(x) < 400)  # TODO: avoid arbitrary cutoff
        weekday = st.sampled_from([None, 0, 1, 2, 3, 4, 5, 6])
        tup = st.tuples(n, normalize, weekday)

    elif cls == LastWeekOfMonth:
        n = n.filter(lambda x: abs(x) < 400)  # TODO: avoid arbitrary cutoff
        n = n.filter(lambda x: x != 0)
        weekday = weekday_strategy
        tup = st.tuples(n, normalize, weekday)

    elif cls == WeekOfMonth:
        n = n.filter(lambda x: abs(x) < 400)  # TODO: avoid arbitrary cutoff
        n = n.filter(lambda x: x != 0)
        week = st.integers(min_value=0, max_value=3)
        weekday = weekday_strategy
        tup = st.tuples(n, normalize, week, weekday)

    elif cls in [SemiMonthBegin, SemiMonthEnd]:
        n = n.filter(lambda x: abs(x) < 800)  # TODO: avoid arbitrary cutoff
        day_of_month = st.integers(min_value=cls._min_day_of_month,
                                   max_value=27)
        tup = st.tuples(n, normalize, day_of_month)

    elif cls is FY5253:
        n = n.filter(lambda x: abs(x) < 6)  # TODO: avoid arbitrary cutoff
        n = n.filter(lambda x: x != 0)
        weekday = weekday_strategy
        startingMonth = month_strategy
        variation = st.sampled_from(["nearest", "last"])
        tup = st.tuples(n, normalize, weekday, startingMonth, variation)

    elif cls is FY5253Quarter:
        n = n.filter(lambda x: abs(x) < 24)  # TODO: avoid arbitrary cutoff
        n = n.filter(lambda x: x != 0)
        weekday = weekday_strategy
        startingMonth = month_strategy
        qtr_with_extra_week = st.integers(min_value=1, max_value=4)
        variation = st.sampled_from(["nearest", "last"])
        tup = st.tuples(n, normalize, weekday, startingMonth,
                        qtr_with_extra_week, variation)

    elif cls is DateOffset:
        # klass = cls(days=value, normalize=normalize)
        # The helper is itself a strategy; draw from it so that an offset
        # instance (not a strategy object) is returned, as in every other
        # branch.
        return draw(gen_random_relativedelta_DateOffset())

    else:
        raise NotImplementedError(cls)

    args = draw(tup)
    return cls(*args)

# ----------------------------------------------------------------
# Tick-specific behavior tests


@given(n=n_strategy, m=n_strategy)
@pytest.mark.parametrize('cls', tick_classes)
def test_tick_add_sub(cls, n, m):
    # Tick arithmetic must mirror integer arithmetic on n: for every Tick
    # subclass and all integers n, m
    #   tick(n) + tick(m) == tick(n + m)
    #   tick(n) - tick(m) == tick(n - m)
    left, right = cls(n), cls(m)

    total = cls(n + m)
    assert left + right == total
    assert left.apply(right) == total

    difference = cls(n - m)
    assert left - right == difference


@given(n=n_strategy, m=n_strategy)
@pytest.mark.parametrize('cls', tick_classes)
def test_tick_equality(cls, n, m):
    # Two ticks of the same class compare equal iff their n match; both
    # == and != are checked in each case.
    left, right = cls(n), cls(m)

    if n != m:
        assert left != right
        assert not (left == right)
        return

    assert left == right
    assert not (left != right)


# ----------------------------------------------------------------

@given(dt=gen_random_datelike(), data=st.data())
@pytest.mark.parametrize('cls', offset_types)
def test_on_offset_implementations(cls, dt, data):
    # Each class-specific onOffset implementation must agree with the
    # general-case definition:
    #   (dt + offset) - offset == dt

    # TODO: Is there a more performant way to do this?
    offset = data.draw(gen_random_offset(cls), label='offset')
    assume(not offset.normalize)

    roundtrip = (dt + offset) - offset
    expected = roundtrip == dt

    assert offset.onOffset(dt) == expected


@given(data=st.data())
@pytest.mark.parametrize('cls', yqm_classes)
def test_apply_index_implementations(cls, data):
    # offset.apply_index(dti) must match dti + offset, and the first/last
    # entries must match scalar addition for both the index and a Series.

    offset = data.draw(gen_random_offset(cls), label='offset')
    assume(offset.n != 0)  # TODO: test for that case separately

    # Fixed alternative kept for reference:
    # rng = pd.date_range(start='1/1/2000', periods=100000, freq='T')
    dti = data.draw(gen_random_date_range(), label='rng')
    ser = pd.Series(dti)

    via_add = dti + offset
    via_apply_index = offset.apply_index(dti)
    assert (via_add == via_apply_index).all()

    # TODO: Check randomly assorted entries, not just first/last
    assert via_add[0] == dti[0] + offset
    assert via_add[-1] == dti[-1] + offset

    # apply_index is only for indexes, not series, so the Series result has
    # no apply_index counterpart to compare against
    ser_shifted = ser + offset
    assert ser_shifted.iloc[0] == ser.iloc[0] + offset
    assert ser_shifted.iloc[-1] == ser.iloc[-1] + offset


@given(freq=gen_date_range_freq())
def test_range_matches_addition(freq):
    # Adding the freq to a date_range should shift it by one position, even
    # when the range crosses a DST transition.

    # pytest.skip raises on its own, so no `raise` is needed in front of it.
    pytest.skip('Need to generate date_range args')

    # Everything below is an unreachable sketch until the skip is removed.
    dr = pd.date_range('2016-10-30 12:00:00', freq=freq,
                       periods=20, tz='US/Eastern')
    assert dr[-1] > pd.Timestamp('2016-11-10')  # DST transition is crossed

    res = dr + freq
    assert res[:-1].equals(dr[1:])


@given(data=st.data())
@pytest.mark.parametrize('cls', yqm_classes)
def test_shift_across_dst(cls, data):
    # GH#18319 check that 1) timezone is correctly normalized and
    # 2) that hour is not incorrectly changed by this normalization

    # pytest.skip raises on its own, so no `raise` is needed in front of it.
    pytest.skip('Need to generate date_range args')

    # Everything below is an unreachable sketch until the skip is removed.
    offset = data.draw(gen_random_offset(cls), label='offset')
    dti = pd.date_range(start='2017-10-30 12:00:00', end='2017-11-06',
                        freq='D', tz='US/Eastern')
    # dti includes a transition across DST boundary
    assert (dti.hour == 12).all()  # we haven't screwed up yet

    res = dti + offset
    assert (res.hour == 12).all()