pandas-dev · sushobhit27 · Apr 3, 2018 · Apr 3, 2018 · Apr 4, 2018 · Apr 4, 2018
diff --git a/ci/environment-dev.yaml b/ci/environment-dev.yaml
@@ -13,3 +13,4 @@ dependencies:
   - pytz
   - setuptools>=3.3
   - sphinx
+  - hypothesis>=3.46.0
diff --git a/ci/requirements-2.7.run b/ci/requirements-2.7.run
@@ -18,3 +18,4 @@ patsy
 pymysql=0.6.3
 jinja2=2.8
 xarray=0.8.0
+hypothesis>=3.46.0
diff --git a/ci/requirements-2.7_COMPAT.run b/ci/requirements-2.7_COMPAT.run
@@ -12,3 +12,4 @@ pymysql=0.6.0
 sqlalchemy=0.7.8
 xlsxwriter=0.5.2
 jinja2=2.8
+hypothesis>=3.46.0
diff --git a/ci/requirements-2.7_LOCALE.run b/ci/requirements-2.7_LOCALE.run
@@ -10,3 +10,4 @@ matplotlib=1.4.3
 sqlalchemy=0.8.1
 lxml
 scipy
+hypothesis>=3.46.0
diff --git a/ci/requirements-2.7_WIN.run b/ci/requirements-2.7_WIN.run
@@ -16,3 +16,4 @@ bottleneck
 html5lib
 beautifulsoup4
 jinja2=2.8
+hypothesis>=3.46.0
diff --git a/ci/requirements-3.5.run b/ci/requirements-3.5.run
@@ -18,3 +18,4 @@ psycopg2
 s3fs
 beautifulsoup4
 ipython
+hypothesis>=3.46.0
diff --git a/ci/requirements-3.5_ASCII.run b/ci/requirements-3.5_ASCII.run
@@ -1,3 +1,4 @@
 python-dateutil
 pytz
 numpy
+hypothesis>=3.46.0
diff --git a/ci/requirements-3.5_OSX.run b/ci/requirements-3.5_OSX.run
@@ -14,3 +14,4 @@ bottleneck
 xarray
 s3fs
 beautifulsoup4
+hypothesis>=3.46.0
diff --git a/ci/requirements-3.6.run b/ci/requirements-3.6.run
@@ -23,3 +23,4 @@ beautifulsoup4
 s3fs
 xarray
 ipython
+hypothesis>=3.46.0
diff --git a/ci/requirements-3.6_LOCALE.run b/ci/requirements-3.6_LOCALE.run
@@ -20,3 +20,4 @@ beautifulsoup4
 s3fs
 xarray
 ipython
+hypothesis>=3.46.0
diff --git a/ci/requirements-3.6_LOCALE_SLOW.run b/ci/requirements-3.6_LOCALE_SLOW.run
@@ -20,3 +20,4 @@ beautifulsoup4
 s3fs
 xarray
 ipython
+hypothesis>=3.46.0
diff --git a/ci/requirements-3.6_NUMPY_DEV.run b/ci/requirements-3.6_NUMPY_DEV.run
@@ -1 +1,2 @@
 pytz
+hypothesis>=3.46.0
diff --git a/ci/requirements-3.6_SLOW.run b/ci/requirements-3.6_SLOW.run
@@ -17,3 +17,4 @@ psycopg2
 pymysql
 html5lib
 beautifulsoup4
+hypothesis>=3.46.0
diff --git a/ci/requirements-3.6_WIN.run b/ci/requirements-3.6_WIN.run
@@ -15,3 +15,4 @@ blosc
 thrift=0.10*
 fastparquet
 pyarrow
+hypothesis>=3.46.0
diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst
@@ -775,6 +775,80 @@ Tests that we have ``parametrized`` are now accessible via the test name, for ex
    test_cool_feature.py::test_dtypes[int8] PASSED
    test_cool_feature.py::test_series[int8] PASSED
 
+Using ``hypothesis``
+~~~~~~~~~~~~~~~~~~~~
+With the transition to pytest, testing has become easier: test cases need less boilerplate and can make use of pytest features such as parametrizing, skipping and marking.
+
+However, one still has to come up with input data examples against which the functionality is tested. There is always a chance of leaving out an example that would have made the test case fail.
+
+Hypothesis is a Python package that helps to overcome this issue by generating the input data from a set of specifications provided by the user.
+For example, suppose we have to test Python's ``sum`` function for a list of integers.
+
+Here is a sample test case using pytest:
+
+.. code-block:: python
+
+    import pytest
+
+    @pytest.mark.parametrize('seq', [
+        [0, 0, 0],
+        [1, 2, 3, 4],
+        [-3, 5, -8, 23],
+        [12345678, 9876543, 567894321]
+    ])
+    def test_sum_using_pytest(seq):
+        total = 0
+        for item in seq:
+            total += item
+        assert sum(seq) == total
+
+Output of the test cases:
+
+.. code-block:: shell
+
+    collecting ... collected 4 items
+    pytest_example.py::test_sum_using_pytest[seq0] PASSED                    [ 25%]
+    pytest_example.py::test_sum_using_pytest[seq1] PASSED                    [ 50%]
+    pytest_example.py::test_sum_using_pytest[seq2] PASSED                    [ 75%]
+    pytest_example.py::test_sum_using_pytest[seq3] PASSED                    [100%]
+
+    ========================== 4 passed in 0.06 seconds ===========================
+
+
+Compare it with the example below, which implements the same test case using hypothesis.
+
+.. code-block:: python
+
+    from hypothesis import strategies as st
+    from hypothesis import given
+
+
+    @given(st.lists(st.integers()))
+    def test_sum(seq):
+        total = 0
+        for item in seq:
+            total += item
+        assert sum(seq) == total
+
+
+Output of the test cases:
+
+.. code-block:: shell
+
+    collecting ... collected 1 item
+    hypothesis_example.py::test_sum PASSED                                   [100%]
+
+    ========================== 1 passed in 0.33 seconds ===========================
+
+The main difference in the above example is the use of the decorator ``@given(st.lists(st.integers()))``. When applied to a test function, it generates random lists of integers, each of which is passed as the argument of the test case.
+The above example thus clearly helps in adding more coverage to our test functions.
+
+For more information about hypothesis, or about property-based testing in general, see the links below:
+
+- https://hypothesis.readthedocs.io/en/latest/quickstart.html
+- https://hypothesis.works/articles/what-is-property-based-testing/
+- http://blog.jessitron.com/2013/04/property-based-testing-what-is-it.html
+
 
 Running the test suite
 ----------------------

diff --git a/pandas/tests/reshape/test_util.py b/pandas/tests/reshape/test_util.py
@@ -4,31 +4,60 @@
 import pandas.util.testing as tm
 from pandas.core.reshape.util import cartesian_product
 
+import string
+from datetime import date
+from dateutil import relativedelta
+
+from pandas.util._hypothesis import (st,
+                                     given,
+                                     settings,
+                                     get_seq,
+                                     assume)
+
+
+NO_OF_EXAMPLES_PER_TEST_CASE = 20
+
 
 class TestCartesianProduct(object):
 
-    def test_simple(self):
-        x, y = list('ABC'), [1, 22]
+    @settings(max_examples=NO_OF_EXAMPLES_PER_TEST_CASE)
+    @given(get_seq((str,), False, 1, 1),
+           get_seq((int,), False, 1, 2))
+    def test_simple(self, x, y):
+        x = list(x[0])
+        # the empty case is handled in test_empty,
+        # therefore ignore it here.
+        assume(len(x) != 0)
         result1, result2 = cartesian_product([x, y])
-        expected1 = np.array(['A', 'A', 'B', 'B', 'C', 'C'])
-        expected2 = np.array([1, 22, 1, 22, 1, 22])
+        expected1 = np.array([item1 for item1 in x for item2 in y])
+        expected2 = np.array([item2 for item1 in x for item2 in y])
+
         tm.assert_numpy_array_equal(result1, expected1)
         tm.assert_numpy_array_equal(result2, expected2)
 
-    def test_datetimeindex(self):
+    @settings(max_examples=NO_OF_EXAMPLES_PER_TEST_CASE)
+    @given(st.dates(min_value=date(1900, 1, 1), max_value=date(2100, 1, 1)))
+    def test_datetimeindex(self, d):
         # regression test for GitHub issue #6439
         # make sure that the ordering on datetimeindex is consistent
-        x = date_range('2000-01-01', periods=2)
+        n = d + relativedelta.relativedelta(days=1)
+        x = date_range(d, periods=2)
         result1, result2 = [Index(y).day for y in cartesian_product([x, x])]
-        expected1 = Index([1, 1, 2, 2])
-        expected2 = Index([1, 2, 1, 2])
+        expected1 = Index([d.day, d.day, n.day, n.day])
+        expected2 = Index([d.day, n.day, d.day, n.day])
+
         tm.assert_index_equal(result1, expected1)
         tm.assert_index_equal(result2, expected2)
 
-    def test_empty(self):
+    @settings(max_examples=NO_OF_EXAMPLES_PER_TEST_CASE)
+    @given(st.lists(st.nothing()),
+           get_seq((int,), False, min_size=1, max_size=10),
+           get_seq((str,), False, min_size=1, max_size=10))
+    def test_empty(self, empty_list, list_of_int, list_of_str):
         # product of empty factors
-        X = [[], [0, 1], []]
-        Y = [[], [], ['a', 'b', 'c']]
+        X = [empty_list, list_of_int, empty_list]
+        Y = [empty_list, empty_list, list_of_str]
+
         for x, y in zip(X, Y):
             expected1 = np.array([], dtype=np.asarray(x).dtype)
             expected2 = np.array([], dtype=np.asarray(y).dtype)
@@ -37,13 +66,24 @@ def test_empty(self):
             tm.assert_numpy_array_equal(result2, expected2)
 
         # empty product (empty input):
-        result = cartesian_product([])
+        result = cartesian_product(empty_list)
         expected = []
         assert result == expected
 
-    def test_invalid_input(self):
-        invalid_inputs = [1, [1], [1, 2], [[1], 2],
-                          'a', ['a'], ['a', 'b'], [['a'], 'b']]
+    @settings(max_examples=NO_OF_EXAMPLES_PER_TEST_CASE)
+    @given(st.integers(),
+           st.text(string.ascii_letters, min_size=1),
+           get_seq((int, str), True, min_size=1),
+           st.builds(lambda *x: list(x), st.integers(),
+                     st.text(string.ascii_letters, min_size=1),
+                     st.lists(st.integers(), min_size=1)))
+    def test_invalid_input(self, number, text, seq, mixed_seq):
+
+        invalid_inputs = [number,
+                          text,
+                          seq,
+                          mixed_seq]
+
         msg = "Input must be a list-like of list-likes"
         for X in invalid_inputs:
             tm.assert_raises_regex(TypeError, msg, cartesian_product, X=X)
diff --git a/pandas/util/_hypothesis.py b/pandas/util/_hypothesis.py
@@ -0,0 +1,97 @@
+import string
+from hypothesis import (given,
+                        settings,
+                        assume,
+                        strategies as st,
+                        )
+
+
+def get_elements(elem_type):
+    """Map bool/int/float/str to a hypothesis strategy, else st.nothing()."""
+    if elem_type == bool:
+        return st.booleans()
+    if elem_type == int:
+        return st.integers()
+    if elem_type == float:
+        return st.floats()
+    if elem_type == str:
+        return st.text(string.ascii_letters, max_size=10)
+    return st.nothing()
+
+
+@st.composite
+def get_seq(draw, types, mixed=False, min_size=None, max_size=None,
+            transform_func=None):
+    """
+    Composite hypothesis strategy that generates lists of simple values.
+
+    ``draw`` is supplied by ``st.composite``; callers pass only the
+    remaining arguments and receive a strategy back.
+
+    Parameters
+    ----------
+    types : iterable of types, e.g. a tuple or list
+        Element types allowed in the generated list. Types that
+        ``get_elements`` does not recognise contribute no values.
+    mixed : bool, default False
+        If True, every element is drawn from any one of ``types``;
+        otherwise all elements are drawn from the first type only.
+    min_size : int, optional
+        Minimum list length. If omitted, it is drawn from 0 to 100 and
+        never exceeds an explicitly given ``max_size``.
+    max_size : int, optional
+        Maximum list length. If omitted, it is drawn from ``min_size`` to
+        100 (or is ``min_size`` itself when that already exceeds 100).
+    transform_func : callable, optional
+        Applied to the whole generated list (via ``.map``), so it can
+        filter or map the elements.
+
+    Returns
+    -------
+    list
+        The drawn (and optionally transformed) list; through
+        ``st.composite`` a call ``get_seq(...)`` yields a strategy.
+
+    Raises
+    ------
+    ValueError
+        If ``min_size`` is greater than ``max_size``.
+
+    Examples
+    --------
+    >>> seq_strategy = get_seq((int, str, bool),
+    ...                        mixed=True, min_size=1, max_size=5)
+    >>> seq_strategy.example()  # doctest: +SKIP
+    ['lkYMSn', -2501, 35, 'J']
+
+    >>> seq_strategy = get_seq((int, bool), min_size=1, max_size=5,
+    ...                        transform_func=lambda seq:
+    ...                        [str(x) for x in seq])
+    >>> seq_strategy.example()  # doctest: +SKIP
+    ['22', '66', '14785', '-26312', '32']
+    """
+    # Draw a missing bound relative to the other one so that the pair is
+    # always consistent, whichever of the two the caller supplied.
+    if min_size is None:
+        upper = 100 if max_size is None else min(100, max_size)
+        min_size = draw(st.integers(min_value=0, max_value=upper))
+    if max_size is None:
+        max_size = draw(st.integers(min_value=min_size,
+                                    max_value=max(100, min_size)))
+
+    # Raise explicitly: an ``assert`` would be stripped under ``python -O``.
+    if min_size > max_size:
+        raise ValueError('max_size must be greater than equal to min_size')
+
+    elem_strategies = []
+    for elem_type in types:
+        elem_strategies.append(get_elements(elem_type))
+        if not mixed:
+            # Homogeneous list: only the first type is used.
+            break
+
+    list_strategy = st.lists(st.one_of(elem_strategies),
+                             min_size=min_size, max_size=max_size)
+    if transform_func:
+        list_strategy = list_strategy.map(transform_func)
+    return draw(list_strategy)
-Original file line number
+Diff line change
@@ Expand Up / @@ -13,3 +13,4 @@ dependencies: @@
       - pytz
       - setuptools>=3.3
       - sphinx
+      - hypothesis>=3.46.0