Commit f0cecf5

TST/CLN: Catch more warnings
Catches a handful of warnings from the dask.dataframe test suite. Remaining warnings fall into a few classes:

1. ResourceWarnings from cloudpickle (e.g. dask/dataframe/tests/test_arithmetics_reduction.py::test_frame_series_arithmetic_methods). Not sure what to do here; probably a legitimate concern of dask's?
2. RuntimeWarnings from numpy (e.g. dask/dataframe/tests/test_arithmetics_reduction.py::test_reductions[False]). I think dask should catch these (follow-up PR).
3. ImportWarnings, seemingly from dependencies importing C code (e.g. dask/dataframe/tests/test_categorical.py::test_categorical_set_index[disk]). I have a fix for partd (msgpack); pandas has some too. Still investigating, may be a Cython issue.
4. DeprecationWarnings on regexes (e.g. dask/dataframe/tests/test_groupby.py::test_full_groupby_multilevel[grouper4]). I think these are all pandas. xref pandas-dev/pandas#16481 (comment)
5. RuntimeWarnings from pandas merge (e.g. dask/dataframe/tests/test_multi.py::test_merge_by_multiple_columns[disk-inner]). Filing an issue on pandas.

The changes below mostly either pass `meta=` so dask has nothing to warn about, silence known third-party warnings at the call site, or assert/tolerate expected warnings in the tests; a short sketch of the latter two patterns follows the file summary.
1 parent e7e2cfe commit f0cecf5

10 files changed: +221 -166 lines changed
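The following is a minimal standalone sketch of the two warning-handling patterns used in the diff, not code from the commit: silencing a known third-party warning at the call site (as in dask/dataframe/methods.py) and requiring an expected warning in a test (as in the test files). Function names are illustrative only.

import warnings

import pytest


def noisy_third_party_call():
    # Stand-in for a library call that emits a RuntimeWarning we cannot fix here.
    warnings.warn("comparing ints and strs", RuntimeWarning)
    return 42


def quiet_wrapper():
    # Silence only the known RuntimeWarning at the call site, mirroring the
    # warnings.catch_warnings block added in dask/dataframe/methods.py.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", RuntimeWarning)
        return noisy_third_party_call()


def test_expected_user_warning():
    # Require that the code under test actually warns, mirroring the
    # pytest.warns(UserWarning) blocks added throughout the test files.
    with pytest.warns(UserWarning):
        warnings.warn("`meta` is not specified", UserWarning)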

dask/dataframe/groupby.py

+1 -1

@@ -911,7 +911,7 @@ def apply(self, func, meta=no_default):
                    " Before: .apply(func)\n"
                    " After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result\n"
                    " or:     .apply(func, meta=('x', 'f8')) for series result")
-            warnings.warn(msg)
+            warnings.warn(msg, stacklevel=2)
 
             with raise_on_meta_error("groupby.apply({0})".format(funcname(func))):
                 meta = self._meta_nonempty.apply(func)
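A side note, not part of the diff: `stacklevel=2` makes the warning point at the caller of `apply` rather than the line inside dask that issues it, so users see where in their own code the unspecified `meta` came from. A minimal illustration:

import warnings


def library_function():
    # With stacklevel=2 the warning is attributed to the caller's line,
    # not to this line inside the library.
    warnings.warn("`meta` is not specified", UserWarning, stacklevel=2)


library_function()  # run with -W always: the reported location is this line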

dask/dataframe/io/tests/test_csv.py

+6 -5

@@ -436,11 +436,12 @@ def test_warn_non_seekable_files():
         assert 'blocksize=None' in msg
 
         with pytest.warns(None) as w:
-            df = dd.read_csv('2014-01-*.csv', compression='gzip', blocksize=None)
+            df = dd.read_csv('2014-01-*.csv', compression='gzip',
+                             blocksize=None)
         assert len(w) == 0
 
         with pytest.raises(NotImplementedError):
-            with pytest.warns(None):
+            with pytest.warns(UserWarning):  # needed for pytest
                 df = dd.read_csv('2014-01-*.csv', compression='foo')
 
@@ -730,8 +731,8 @@ def test_read_csv_sep():
    charlie###300""")
 
     with filetext(sep_text) as fn:
-        ddf = dd.read_csv(fn, sep="###")
-        df = pd.read_csv(fn, sep="###")
+        ddf = dd.read_csv(fn, sep="###", engine="python")
+        df = pd.read_csv(fn, sep="###", engine="python")
 
         assert (df.columns == ddf.columns).all()
         assert len(df) == len(ddf)
 
@@ -868,7 +869,7 @@ def test_to_csv_gzip():
     for npartitions in [1, 2]:
         a = dd.from_pandas(df, npartitions)
         with tmpfile('csv') as fn:
-            a.to_csv(fn, compression='gzip')
+            a.to_csv(fn, compression='gzip', sep=",")
             result = pd.read_csv(fn, index_col=0, compression='gzip')
             tm.assert_frame_equal(result, df)
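For readers unfamiliar with the idioms above (this note is not from the commit): in the pytest versions of this era, `pytest.warns(None)` records warnings without requiring any, so pairing it with `assert len(w) == 0` asserts that the call is warning-free, while `pytest.warns(UserWarning)` fails unless a `UserWarning` is actually raised. A minimal sketch:

import warnings

import pytest


def test_call_is_warning_free():
    with pytest.warns(None) as record:  # records warnings without requiring one
        value = 1 + 1
    assert len(record) == 0             # fail the test if anything warned


def test_warning_is_required():
    with pytest.warns(UserWarning):     # fails unless a UserWarning is raised
        warnings.warn("expected", UserWarning)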

dask/dataframe/io/tests/test_io.py

+1 -1

@@ -315,7 +315,7 @@ def test_from_pandas_with_datetime_index():
                                 "2015-08-25", "2015-08-24", "2015-08-21",
                                 "2015-08-20", "2015-08-19", "2015-08-18"],
                        "Val": list(range(9))})
-    df.Date = df.Date.astype('datetime64')
+    df.Date = df.Date.astype('datetime64[ns]')
     ddf = dd.from_pandas(df, 2)
     assert_eq(df, ddf)
     ddf = dd.from_pandas(df, chunksize=2)

dask/dataframe/methods.py

+11 -3

@@ -1,5 +1,7 @@
 from __future__ import print_function, absolute_import, division
 
+import warnings
+
 import numpy as np
 import pandas as pd
 from pandas.api.types import is_categorical_dtype
 
@@ -253,8 +255,11 @@ def concat(dfs, axis=0, join='outer', uniform=False):
             # concatenates.
             dfs3 = [df if isinstance(df, pd.DataFrame) else
                     df.to_frame().rename(columns={df.name: 0}) for df in dfs2]
-            cat_mask = pd.concat([(df.dtypes == 'category').to_frame().T
-                                  for df in dfs3], join=join).any()
+            # pandas may raise a RuntimeWarning for comparing ints and strs
+            with warnings.catch_warnings():
+                warnings.simplefilter("ignore", RuntimeWarning)
+                cat_mask = pd.concat([(df.dtypes == 'category').to_frame().T
+                                      for df in dfs3], join=join).any()
 
         if cat_mask.any():
             not_cat = cat_mask[~cat_mask].index
 
@@ -280,7 +285,10 @@ def concat(dfs, axis=0, join='outer', uniform=False):
                 out[col] = union_categoricals(parts)
             out = out.reindex_axis(cat_mask.index, axis=1)
         else:
-            out = pd.concat(dfs3, join=join)
+            # pandas may raise a RuntimeWarning for comparing ints and strs
+            with warnings.catch_warnings():
+                warnings.simplefilter("ignore", RuntimeWarning)
+                out = pd.concat(dfs3, join=join)
     else:
         if is_categorical_dtype(dfs2[0].dtype):
             if ind is None:

dask/dataframe/tests/test_arithmetics_reduction.py

+6 -3

@@ -635,9 +635,12 @@ def test_reductions(split_every):
     assert_eq(dds.min(split_every=split_every), pds.min())
     assert_eq(dds.max(split_every=split_every), pds.max())
     assert_eq(dds.count(split_every=split_every), pds.count())
-    assert_eq(dds.std(split_every=split_every), pds.std())
-    assert_eq(dds.var(split_every=split_every), pds.var())
-    assert_eq(dds.sem(split_every=split_every), pds.sem())
+    with pytest.warns(None):
+        assert_eq(dds.std(split_every=split_every), pds.std())
+    with pytest.warns(None):
+        assert_eq(dds.var(split_every=split_every), pds.var())
+    with pytest.warns(None):
+        assert_eq(dds.sem(split_every=split_every), pds.sem())
     assert_eq(dds.std(ddof=0, split_every=split_every), pds.std(ddof=0))
     assert_eq(dds.var(ddof=0, split_every=split_every), pds.var(ddof=0))
     assert_eq(dds.sem(ddof=0, split_every=split_every), pds.sem(ddof=0))
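Context, not from the commit: these `pytest.warns(None)` wrappers tolerate the numpy RuntimeWarnings mentioned in point 2 of the commit message rather than asserting them, on the theory that dask itself should eventually suppress them (the promised follow-up PR). One plausible way such a warning can arise from a small partition, shown purely as an illustration:

import warnings

import numpy as np

# std/var with ddof >= the number of observations makes numpy warn and return
# nan, which is the kind of RuntimeWarning a per-partition reduction can hit.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    np.std(np.array([1.0]), ddof=1)

print([str(w.message) for w in caught])  # e.g. "Degrees of freedom <= 0 for slice"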

dask/dataframe/tests/test_dataframe.py

+17 -11

@@ -1381,9 +1381,10 @@ def test_eval():
     with pytest.raises(NotImplementedError):
         d.eval('z = x + y', inplace=True)
 
-    if p.eval('z = x + y', inplace=None) is None:
-        with pytest.raises(NotImplementedError):
-            d.eval('z = x + y', inplace=None)
+    with pytest.warns(None):
+        if p.eval('z = x + y', inplace=None) is None:
+            with pytest.raises(NotImplementedError):
+                d.eval('z = x + y', inplace=None)
 
 
 @pytest.mark.parametrize('include, exclude', [
 
@@ -1702,7 +1703,7 @@ def test_apply():
     ddf = dd.from_pandas(df, npartitions=2)
 
     func = lambda row: row['x'] + row['y']
-    assert_eq(ddf.x.apply(lambda x: x + 1),
+    assert_eq(ddf.x.apply(lambda x: x + 1, meta=("x", int)),
               df.x.apply(lambda x: x + 1))
 
     # specify meta
 
@@ -1712,16 +1713,17 @@ def test_apply():
               df.apply(lambda xy: xy[0] + xy[1], axis='columns'))
 
     # inference
-    assert_eq(ddf.apply(lambda xy: xy[0] + xy[1], axis=1),
+    assert_eq(ddf.apply(lambda xy: xy[0] + xy[1], axis=1, meta=(None, int)),
              df.apply(lambda xy: xy[0] + xy[1], axis=1))
-    assert_eq(ddf.apply(lambda xy: xy, axis=1),
+    assert_eq(ddf.apply(lambda xy: xy, axis=1, meta={'x': int, 'y': int}),
              df.apply(lambda xy: xy, axis=1))
 
     # specify meta
     func = lambda x: pd.Series([x, x])
     assert_eq(ddf.x.apply(func, meta=[(0, int), (1, int)]), df.x.apply(func))
     # inference
-    assert_eq(ddf.x.apply(func), df.x.apply(func))
+    with pytest.warns(UserWarning):
+        assert_eq(ddf.x.apply(func), df.x.apply(func))
 
     # axis=0
     with pytest.raises(NotImplementedError):
 
@@ -1886,13 +1888,15 @@ def return_df(x):
         return pd.Series([x.sum(), x.mean()], index=['sum', 'mean'])
 
     # DataFrame to completely different DataFrame
-    result = ddf.apply(return_df, axis=1)
+    with pytest.warns(UserWarning):
+        result = ddf.apply(return_df, axis=1)
     assert isinstance(result, dd.DataFrame)
     tm.assert_index_equal(result.columns, pd.Index(['sum', 'mean']))
    assert_eq(result, df.apply(return_df, axis=1))
 
     # DataFrame to Series
-    result = ddf.apply(lambda x: 1, axis=1)
+    with pytest.warns(UserWarning):
+        result = ddf.apply(lambda x: 1, axis=1)
     assert isinstance(result, dd.Series)
     assert result.name is None
     assert_eq(result, df.apply(lambda x: 1, axis=1))
 
@@ -1901,13 +1905,15 @@ def return_df2(x):
         return pd.Series([x * 2, x * 3], index=['x2', 'x3'])
 
     # Series to completely different DataFrame
-    result = ddf.x.apply(return_df2)
+    with pytest.warns(UserWarning):
+        result = ddf.x.apply(return_df2)
     assert isinstance(result, dd.DataFrame)
     tm.assert_index_equal(result.columns, pd.Index(['x2', 'x3']))
     assert_eq(result, df.x.apply(return_df2))
 
     # Series to Series
-    result = ddf.x.apply(lambda x: 1)
+    with pytest.warns(UserWarning):
+        result = ddf.x.apply(lambda x: 1)
     assert isinstance(result, dd.Series)
     assert result.name == 'x'
     assert_eq(result, df.x.apply(lambda x: 1))

dask/dataframe/tests/test_groupby.py

+64 -47

@@ -110,7 +110,7 @@ def func(df):
         return df
 
     assert_eq(df.groupby('a').apply(func),
-              ddf.groupby('a').apply(func))
+              ddf.groupby('a').apply(func, meta={"a": int, "b": float}))
 
 
 @pytest.mark.parametrize('grouper', [
 
@@ -131,8 +131,10 @@ def func(df):
         df['b'] = df.b - df.b.mean()
         return df
 
+    # last one causes a DeprecationWarning from pandas, hard to track down...
     assert_eq(df.groupby(grouper(df)).apply(func),
-              ddf.groupby(grouper(ddf)).apply(func))
+              ddf.groupby(grouper(ddf)).apply(func, meta={"a": int, "d": int,
+                                                          "b": float}))
 
 
 def test_groupby_dir():
 
@@ -158,14 +160,15 @@ def func(df):
         return df.assign(b=df.b - df.b.mean())
 
     with dask.set_options(get=get):
-        assert_eq(ddf.groupby('a').apply(func),
-                  pdf.groupby('a').apply(func))
+        with pytest.warns(None):
+            assert_eq(ddf.groupby('a').apply(func),
+                      pdf.groupby('a').apply(func))
 
-        assert_eq(ddf.groupby('a').apply(func).set_index('a'),
-                  pdf.groupby('a').apply(func).set_index('a'))
+            assert_eq(ddf.groupby('a').apply(func).set_index('a'),
+                      pdf.groupby('a').apply(func).set_index('a'))
 
-        assert_eq(pdf2.groupby(pdf2.index).apply(func),
-                  ddf2.groupby(ddf2.index).apply(func))
+            assert_eq(pdf2.groupby(pdf2.index).apply(func),
+                      ddf2.groupby(ddf2.index).apply(func))
 
 
 def test_groupby_multilevel_getitem():
 
@@ -258,7 +261,8 @@ def test_series_groupby_propagates_names():
     df = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
     ddf = dd.from_pandas(df, 2)
     func = lambda df: df['y'].sum()
-    result = ddf.groupby('x').apply(func)
+    with pytest.warns(UserWarning):
+        result = ddf.groupby('x').apply(func)
     expected = df.groupby('x').apply(func)
     assert_eq(result, expected)
 
@@ -507,40 +511,42 @@ def call(g, m, **kwargs):
 
 
 def test_apply_shuffle():
+    import warnings; warnings.simplefilter("error", UserWarning)
     pdf = pd.DataFrame({'A': [1, 2, 3, 4] * 5,
                         'B': np.random.randn(20),
                         'C': np.random.randn(20),
                         'D': np.random.randn(20)})
     ddf = dd.from_pandas(pdf, 3)
 
-    assert_eq(ddf.groupby('A').apply(lambda x: x.sum()),
-              pdf.groupby('A').apply(lambda x: x.sum()))
+    with pytest.warns(UserWarning):
+        assert_eq(ddf.groupby('A').apply(lambda x: x.sum()),
+                  pdf.groupby('A').apply(lambda x: x.sum()))
 
-    assert_eq(ddf.groupby(ddf['A']).apply(lambda x: x.sum()),
-              pdf.groupby(pdf['A']).apply(lambda x: x.sum()))
+        assert_eq(ddf.groupby(ddf['A']).apply(lambda x: x.sum()),
+                  pdf.groupby(pdf['A']).apply(lambda x: x.sum()))
 
-    assert_eq(ddf.groupby(ddf['A'] + 1).apply(lambda x: x.sum()),
-              pdf.groupby(pdf['A'] + 1).apply(lambda x: x.sum()))
+        assert_eq(ddf.groupby(ddf['A'] + 1).apply(lambda x: x.sum()),
+                  pdf.groupby(pdf['A'] + 1).apply(lambda x: x.sum()))
 
-    # SeriesGroupBy
-    assert_eq(ddf.groupby('A')['B'].apply(lambda x: x.sum()),
-              pdf.groupby('A')['B'].apply(lambda x: x.sum()))
+        # SeriesGroupBy
+        assert_eq(ddf.groupby('A')['B'].apply(lambda x: x.sum()),
+                  pdf.groupby('A')['B'].apply(lambda x: x.sum()))
 
-    assert_eq(ddf.groupby(ddf['A'])['B'].apply(lambda x: x.sum()),
-              pdf.groupby(pdf['A'])['B'].apply(lambda x: x.sum()))
+        assert_eq(ddf.groupby(ddf['A'])['B'].apply(lambda x: x.sum()),
+                  pdf.groupby(pdf['A'])['B'].apply(lambda x: x.sum()))
 
-    assert_eq(ddf.groupby(ddf['A'] + 1)['B'].apply(lambda x: x.sum()),
-              pdf.groupby(pdf['A'] + 1)['B'].apply(lambda x: x.sum()))
+        assert_eq(ddf.groupby(ddf['A'] + 1)['B'].apply(lambda x: x.sum()),
+                  pdf.groupby(pdf['A'] + 1)['B'].apply(lambda x: x.sum()))
 
-    # DataFrameGroupBy with column slice
-    assert_eq(ddf.groupby('A')[['B', 'C']].apply(lambda x: x.sum()),
-              pdf.groupby('A')[['B', 'C']].apply(lambda x: x.sum()))
+        # DataFrameGroupBy with column slice
+        assert_eq(ddf.groupby('A')[['B', 'C']].apply(lambda x: x.sum()),
+                  pdf.groupby('A')[['B', 'C']].apply(lambda x: x.sum()))
 
-    assert_eq(ddf.groupby(ddf['A'])[['B', 'C']].apply(lambda x: x.sum()),
-              pdf.groupby(pdf['A'])[['B', 'C']].apply(lambda x: x.sum()))
+        assert_eq(ddf.groupby(ddf['A'])[['B', 'C']].apply(lambda x: x.sum()),
+                  pdf.groupby(pdf['A'])[['B', 'C']].apply(lambda x: x.sum()))
 
-    assert_eq(ddf.groupby(ddf['A'] + 1)[['B', 'C']].apply(lambda x: x.sum()),
-              pdf.groupby(pdf['A'] + 1)[['B', 'C']].apply(lambda x: x.sum()))
+        assert_eq(ddf.groupby(ddf['A'] + 1)[['B', 'C']].apply(lambda x: x.sum()),
+                  pdf.groupby(pdf['A'] + 1)[['B', 'C']].apply(lambda x: x.sum()))
 
 
 @pytest.mark.parametrize('grouper', [
 
@@ -559,17 +565,18 @@ def test_apply_shuffle_multilevel(grouper):
                         'D': np.random.randn(20)})
     ddf = dd.from_pandas(pdf, 3)
 
-    # DataFrameGroupBy
-    assert_eq(ddf.groupby(grouper(ddf)).apply(lambda x: x.sum()),
-              pdf.groupby(grouper(pdf)).apply(lambda x: x.sum()))
+    with pytest.warns(UserWarning):
+        # DataFrameGroupBy
+        assert_eq(ddf.groupby(grouper(ddf)).apply(lambda x: x.sum()),
+                  pdf.groupby(grouper(pdf)).apply(lambda x: x.sum()))
 
-    # SeriesGroupBy
-    assert_eq(ddf.groupby(grouper(ddf))['B'].apply(lambda x: x.sum()),
-              pdf.groupby(grouper(pdf))['B'].apply(lambda x: x.sum()))
+        # SeriesGroupBy
+        assert_eq(ddf.groupby(grouper(ddf))['B'].apply(lambda x: x.sum()),
+                  pdf.groupby(grouper(pdf))['B'].apply(lambda x: x.sum()))
 
-    # DataFrameGroupBy with column slice
-    assert_eq(ddf.groupby(grouper(ddf))[['B', 'C']].apply(lambda x: x.sum()),
-              pdf.groupby(grouper(pdf))[['B', 'C']].apply(lambda x: x.sum()))
+        # DataFrameGroupBy with column slice
+        assert_eq(ddf.groupby(grouper(ddf))[['B', 'C']].apply(lambda x: x.sum()),
+                  pdf.groupby(grouper(pdf))[['B', 'C']].apply(lambda x: x.sum()))
 
 
 def test_numeric_column_names():
 
@@ -581,7 +588,7 @@ def test_numeric_column_names():
     ddf = dd.from_pandas(df, npartitions=2)
     assert_eq(ddf.groupby(0).sum(), df.groupby(0).sum())
     assert_eq(ddf.groupby([0, 2]).sum(), df.groupby([0, 2]).sum())
-    assert_eq(ddf.groupby(0).apply(lambda x: x),
+    assert_eq(ddf.groupby(0).apply(lambda x: x, meta={0: int, 1: int, 2: int}),
              df.groupby(0).apply(lambda x: x))
 
 
@@ -594,12 +601,14 @@ def test_groupby_apply_tasks():
     with dask.set_options(shuffle='tasks'):
         for ind in [lambda x: 'A', lambda x: x.A]:
             a = df.groupby(ind(df)).apply(len)
-            b = ddf.groupby(ind(ddf)).apply(len)
+            with pytest.warns(UserWarning):
+                b = ddf.groupby(ind(ddf)).apply(len)
            assert_eq(a, b.compute())
             assert not any('partd' in k[0] for k in b.dask)
 
             a = df.groupby(ind(df)).B.apply(len)
-            b = ddf.groupby(ind(ddf)).B.apply(len)
+            with pytest.warns(UserWarning):
+                b = ddf.groupby(ind(ddf)).B.apply(len)
             assert_eq(a, b.compute())
             assert not any('partd' in k[0] for k in b.dask)
 
@@ -610,7 +619,8 @@ def test_groupby_multiprocessing():
                        'B': ['1', '1', 'a', 'a', 'a']})
     ddf = dd.from_pandas(df, npartitions=3)
     with dask.set_options(get=get):
-        assert_eq(ddf.groupby('B').apply(lambda x: x),
+        assert_eq(ddf.groupby('B').apply(lambda x: x, meta={"A": int,
+                                                            "B": object}),
                  df.groupby('B').apply(lambda x: x))
 
 
@@ -652,8 +662,12 @@ def test_aggregate__examples(spec, split_every, grouper):
                        columns=['c', 'b', 'a', 'd'])
     ddf = dd.from_pandas(pdf, npartitions=10)
 
-    assert_eq(pdf.groupby(grouper(pdf)).agg(spec),
-              ddf.groupby(grouper(ddf)).agg(spec, split_every=split_every))
+    # Warning from pandas deprecation .agg(dict[dict])
+    # it's from pandas, so no reason to assert the deprecation warning,
+    # but we should still test it for now
+    with pytest.warns(None):
+        assert_eq(pdf.groupby(grouper(pdf)).agg(spec),
+                  ddf.groupby(grouper(ddf)).agg(spec, split_every=split_every))
 
 
 @pytest.mark.parametrize('spec', [
 
@@ -678,9 +692,12 @@ def test_series_aggregate__examples(spec, split_every, grouper):
 
     ddf = dd.from_pandas(pdf, npartitions=10)
     ds = ddf['c']
-
-    assert_eq(ps.groupby(grouper(pdf)).agg(spec),
-              ds.groupby(grouper(ddf)).agg(spec, split_every=split_every))
+    # Warning from pandas deprecation .agg(dict[dict])
+    # it's from pandas, so no reason to assert the deprecation warning,
+    # but we should still test it for now
+    with pytest.warns(None):
+        assert_eq(ps.groupby(grouper(pdf)).agg(spec),
+                  ds.groupby(grouper(ddf)).agg(spec, split_every=split_every))
 
 
 @pytest.mark.parametrize('spec', [
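As background for the `meta=` additions above (a sketch, not part of the commit): dask's `groupby(...).apply` infers the output schema by running the function on a small dummy frame and emits a UserWarning when `meta` is not given; supplying `meta` explicitly, as these tests now do, avoids both the inference step and the warning. Column names and dtypes below are illustrative.

import pandas as pd
import dask.dataframe as dd

df = pd.DataFrame({"a": [1, 2, 1, 2], "b": [1.0, 2.0, 3.0, 4.0]})
ddf = dd.from_pandas(df, npartitions=2)


def demean(part):
    # subtract the group mean from column 'b', keeping the frame shape
    part["b"] = part.b - part.b.mean()
    return part


# Declaring the output schema up front keeps the computation warning-free.
result = ddf.groupby("a").apply(demean, meta={"a": int, "b": float})
print(result.compute())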
