Commit f0cecf5

TST/CLN: Catch more warnings
Catches a handful of warnings from the dask.dataframe test suite. Remaining warnings fall into a few classes:

1. ResourceWarnings from cloudpickle (e.g. dask/dataframe/tests/test_arithmetics_reduction.py::test_frame_series_arithmetic_methods). Not sure what to do here; probably a legitimate concern of dask's?
2. RuntimeWarnings from numpy (e.g. dask/dataframe/tests/test_arithmetics_reduction.py::test_reductions[False]). I think dask should catch these (follow-up PR).
3. ImportWarnings, seemingly from dependencies importing C code (e.g. dask/dataframe/tests/test_categorical.py::test_categorical_set_index[disk]). I have a fix for partd (msgpack); pandas has some too. Still investigating, may be a Cython issue.
4. DeprecationWarnings on regexes (e.g. dask/dataframe/tests/test_groupby.py::test_full_groupby_multilevel[grouper4]). I think these are all pandas. xref pandas-dev/pandas#16481 (comment)
5. RuntimeWarnings from pandas merge (e.g. dask/dataframe/tests/test_multi.py::test_merge_by_multiple_columns[disk-inner]). Filing an issue on pandas.

The changes below mostly either pass `meta=` so dask has nothing to warn about, silence known third-party warnings at the call site, or assert/tolerate expected warnings in the tests; a short sketch of the latter two patterns follows the file summary.
1 parent e7e2cfe commit f0cecf5

10 files changed: +221 -166 lines changed
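The following is a minimal standalone sketch of the two warning-handling patterns used in the diff, not code from the commit: silencing a known third-party warning at the call site (as in dask/dataframe/methods.py) and requiring an expected warning in a test (as in the test files). Function names are illustrative only.

import warnings

import pytest


def noisy_third_party_call():
    # Stand-in for a library call that emits a RuntimeWarning we cannot fix here.
    warnings.warn("comparing ints and strs", RuntimeWarning)
    return 42


def quiet_wrapper():
    # Silence only the known RuntimeWarning at the call site, mirroring the
    # warnings.catch_warnings block added in dask/dataframe/methods.py.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", RuntimeWarning)
        return noisy_third_party_call()


def test_expected_user_warning():
    # Require that the code under test actually warns, mirroring the
    # pytest.warns(UserWarning) blocks added throughout the test files.
    with pytest.warns(UserWarning):
        warnings.warn("`meta` is not specified", UserWarning)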

dask/dataframe/groupby.py

+1 -1

@@ -911,7 +911,7 @@ def apply(self, func, meta=no_default):
                    " Before: .apply(func)\n"
                    " After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result\n"
                    " or:     .apply(func, meta=('x', 'f8')) for series result")
-            warnings.warn(msg)
+            warnings.warn(msg, stacklevel=2)
 
             with raise_on_meta_error("groupby.apply({0})".format(funcname(func))):
                 meta = self._meta_nonempty.apply(func)
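A side note, not part of the diff: `stacklevel=2` makes the warning point at the caller of `apply` rather than the line inside dask that issues it, so users see where in their own code the unspecified `meta` came from. A minimal illustration:

import warnings


def library_function():
    # With stacklevel=2 the warning is attributed to the caller's line,
    # not to this line inside the library.
    warnings.warn("`meta` is not specified", UserWarning, stacklevel=2)


library_function()  # run with -W always: the reported location is this line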

dask/dataframe/io/tests/test_csv.py

+6 -5

@@ -436,11 +436,12 @@ def test_warn_non_seekable_files():
         assert 'blocksize=None' in msg
 
         with pytest.warns(None) as w:
-            df = dd.read_csv('2014-01-*.csv', compression='gzip', blocksize=None)
+            df = dd.read_csv('2014-01-*.csv', compression='gzip',
+                             blocksize=None)
         assert len(w) == 0
 
         with pytest.raises(NotImplementedError):
-            with pytest.warns(None):
+            with pytest.warns(UserWarning):  # needed for pytest
                 df = dd.read_csv('2014-01-*.csv', compression='foo')
 
@@ -730,8 +731,8 @@ def test_read_csv_sep():
    charlie###300""")
 
     with filetext(sep_text) as fn:
-        ddf = dd.read_csv(fn, sep="###")
-        df = pd.read_csv(fn, sep="###")
+        ddf = dd.read_csv(fn, sep="###", engine="python")
+        df = pd.read_csv(fn, sep="###", engine="python")
 
         assert (df.columns == ddf.columns).all()
         assert len(df) == len(ddf)
 
@@ -868,7 +869,7 @@ def test_to_csv_gzip():
     for npartitions in [1, 2]:
         a = dd.from_pandas(df, npartitions)
         with tmpfile('csv') as fn:
-            a.to_csv(fn, compression='gzip')
+            a.to_csv(fn, compression='gzip', sep=",")
             result = pd.read_csv(fn, index_col=0, compression='gzip')
             tm.assert_frame_equal(result, df)
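For readers unfamiliar with the idioms above (this note is not from the commit): in the pytest versions of this era, `pytest.warns(None)` records warnings without requiring any, so pairing it with `assert len(w) == 0` asserts that the call is warning-free, while `pytest.warns(UserWarning)` fails unless a `UserWarning` is actually raised. A minimal sketch:

import warnings

import pytest


def test_call_is_warning_free():
    with pytest.warns(None) as record:  # records warnings without requiring one
        value = 1 + 1
    assert len(record) == 0             # fail the test if anything warned


def test_warning_is_required():
    with pytest.warns(UserWarning):     # fails unless a UserWarning is raised
        warnings.warn("expected", UserWarning)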

dask/dataframe/io/tests/test_io.py

+1 -1

@@ -315,7 +315,7 @@ def test_from_pandas_with_datetime_index():
                                 "2015-08-25", "2015-08-24", "2015-08-21",
                                 "2015-08-20", "2015-08-19", "2015-08-18"],
                        "Val": list(range(9))})
-    df.Date = df.Date.astype('datetime64')
+    df.Date = df.Date.astype('datetime64[ns]')
     ddf = dd.from_pandas(df, 2)
     assert_eq(df, ddf)
     ddf = dd.from_pandas(df, chunksize=2)

dask/dataframe/methods.py

+11 -3

@@ -1,5 +1,7 @@
 from __future__ import print_function, absolute_import, division
 
+import warnings
+
 import numpy as np
 import pandas as pd
 from pandas.api.types import is_categorical_dtype
 
@@ -253,8 +255,11 @@ def concat(dfs, axis=0, join='outer', uniform=False):
             # concatenates.
             dfs3 = [df if isinstance(df, pd.DataFrame) else
                     df.to_frame().rename(columns={df.name: 0}) for df in dfs2]
-            cat_mask = pd.concat([(df.dtypes == 'category').to_frame().T
-                                  for df in dfs3], join=join).any()
+            # pandas may raise a RuntimeWarning for comparing ints and strs
+            with warnings.catch_warnings():
+                warnings.simplefilter("ignore", RuntimeWarning)
+                cat_mask = pd.concat([(df.dtypes == 'category').to_frame().T
+                                      for df in dfs3], join=join).any()
 
         if cat_mask.any():
             not_cat = cat_mask[~cat_mask].index
 
@@ -280,7 +285,10 @@ def concat(dfs, axis=0, join='outer', uniform=False):
                 out[col] = union_categoricals(parts)
             out = out.reindex_axis(cat_mask.index, axis=1)
         else:
-            out = pd.concat(dfs3, join=join)
+            # pandas may raise a RuntimeWarning for comparing ints and strs
+            with warnings.catch_warnings():
+                warnings.simplefilter("ignore", RuntimeWarning)
+                out = pd.concat(dfs3, join=join)
     else:
         if is_categorical_dtype(dfs2[0].dtype):
             if ind is None:

dask/dataframe/tests/test_arithmetics_reduction.py

+6 -3

@@ -635,9 +635,12 @@ def test_reductions(split_every):
     assert_eq(dds.min(split_every=split_every), pds.min())
     assert_eq(dds.max(split_every=split_every), pds.max())
     assert_eq(dds.count(split_every=split_every), pds.count())
-    assert_eq(dds.std(split_every=split_every), pds.std())
-    assert_eq(dds.var(split_every=split_every), pds.var())
-    assert_eq(dds.sem(split_every=split_every), pds.sem())
+    with pytest.warns(None):
+        assert_eq(dds.std(split_every=split_every), pds.std())
+    with pytest.warns(None):
+        assert_eq(dds.var(split_every=split_every), pds.var())
+    with pytest.warns(None):
+        assert_eq(dds.sem(split_every=split_every), pds.sem())
     assert_eq(dds.std(ddof=0, split_every=split_every), pds.std(ddof=0))
     assert_eq(dds.var(ddof=0, split_every=split_every), pds.var(ddof=0))
     assert_eq(dds.sem(ddof=0, split_every=split_every), pds.sem(ddof=0))
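Context, not from the commit: these `pytest.warns(None)` wrappers tolerate the numpy RuntimeWarnings mentioned in point 2 of the commit message rather than asserting them, on the theory that dask itself should eventually suppress them (the promised follow-up PR). One plausible way such a warning can arise from a small partition, shown purely as an illustration:

import warnings

import numpy as np

# std/var with ddof >= the number of observations makes numpy warn and return
# nan, which is the kind of RuntimeWarning a per-partition reduction can hit.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    np.std(np.array([1.0]), ddof=1)

print([str(w.message) for w in caught])  # e.g. "Degrees of freedom <= 0 for slice"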

dask/dataframe/tests/test_dataframe.py

+17 -11

@@ -1381,9 +1381,10 @@ def test_eval():
     with pytest.raises(NotImplementedError):
         d.eval('z = x + y', inplace=True)
 
-    if p.eval('z = x + y', inplace=None) is None:
-        with pytest.raises(NotImplementedError):
-            d.eval('z = x + y', inplace=None)
+    with pytest.warns(None):
+        if p.eval('z = x + y', inplace=None) is None:
+            with pytest.raises(NotImplementedError):
+                d.eval('z = x + y', inplace=None)
 
 
 @pytest.mark.parametrize('include, exclude', [
 
@@ -1702,7 +1703,7 @@ def test_apply():
     ddf = dd.from_pandas(df, npartitions=2)
 
     func = lambda row: row['x'] + row['y']
-    assert_eq(ddf.x.apply(lambda x: x + 1),
+    assert_eq(ddf.x.apply(lambda x: x + 1, meta=("x", int)),
               df.x.apply(lambda x: x + 1))
 
     # specify meta
 
@@ -1712,16 +1713,17 @@ def test_apply():
               df.apply(lambda xy: xy[0] + xy[1], axis='columns'))
 
     # inference
-    assert_eq(ddf.apply(lambda xy: xy[0] + xy[1], axis=1),
+    assert_eq(ddf.apply(lambda xy: xy[0] + xy[1], axis=1, meta=(None, int)),
              df.apply(lambda xy: xy[0] + xy[1], axis=1))
-    assert_eq(ddf.apply(lambda xy: xy, axis=1),
+    assert_eq(ddf.apply(lambda xy: xy, axis=1, meta={'x': int, 'y': int}),
              df.apply(lambda xy: xy, axis=1))
 
     # specify meta
     func = lambda x: pd.Series([x, x])
     assert_eq(ddf.x.apply(func, meta=[(0, int), (1, int)]), df.x.apply(func))
     # inference
-    assert_eq(ddf.x.apply(func), df.x.apply(func))
+    with pytest.warns(UserWarning):
+        assert_eq(ddf.x.apply(func), df.x.apply(func))
 
     # axis=0
     with pytest.raises(NotImplementedError):
 
@@ -1886,13 +1888,15 @@ def return_df(x):
         return pd.Series([x.sum(), x.mean()], index=['sum', 'mean'])
 
     # DataFrame to completely different DataFrame
-    result = ddf.apply(return_df, axis=1)
+    with pytest.warns(UserWarning):
+        result = ddf.apply(return_df, axis=1)
     assert isinstance(result, dd.DataFrame)
     tm.assert_index_equal(result.columns, pd.Index(['sum', 'mean']))
    assert_eq(result, df.apply(return_df, axis=1))
 
     # DataFrame to Series
-    result = ddf.apply(lambda x: 1, axis=1)
+    with pytest.warns(UserWarning):
+        result = ddf.apply(lambda x: 1, axis=1)
     assert isinstance(result, dd.Series)
     assert result.name is None
     assert_eq(result, df.apply(lambda x: 1, axis=1))
 
@@ -1901,13 +1905,15 @@ def return_df2(x):
         return pd.Series([x * 2, x * 3], index=['x2', 'x3'])
 
     # Series to completely different DataFrame
-    result = ddf.x.apply(return_df2)
+    with pytest.warns(UserWarning):
+        result = ddf.x.apply(return_df2)
     assert isinstance(result, dd.DataFrame)
     tm.assert_index_equal(result.columns, pd.Index(['x2', 'x3']))
     assert_eq(result, df.x.apply(return_df2))
 
     # Series to Series
-    result = ddf.x.apply(lambda x: 1)
+    with pytest.warns(UserWarning):
+        result = ddf.x.apply(lambda x: 1)
     assert isinstance(result, dd.Series)
     assert result.name == 'x'
     assert_eq(result, df.x.apply(lambda x: 1))

dask/dataframe/tests/test_groupby.py

+64 -47

@@ -110,7 +110,7 @@ def func(df):
         return df
 
     assert_eq(df.groupby('a').apply(func),
-              ddf.groupby('a').apply(func))
+              ddf.groupby('a').apply(func, meta={"a": int, "b": float}))
 
 
 @pytest.mark.parametrize('grouper', [
 
@@ -131,8 +131,10 @@ def func(df):
         df['b'] = df.b - df.b.mean()
         return df
 
+    # last one causes a DeprecationWarning from pandas, hard to track down...
     assert_eq(df.groupby(grouper(df)).apply(func),
-              ddf.groupby(grouper(ddf)).apply(func))
+              ddf.groupby(grouper(ddf)).apply(func, meta={"a": int, "d": int,
+                                                          "b": float}))
 
 
 def test_groupby_dir():
 
@@ -158,14 +160,15 @@ def func(df):
         return df.assign(b=df.b - df.b.mean())
 
     with dask.set_options(get=get):
-        assert_eq(ddf.groupby('a').apply(func),
-                  pdf.groupby('a').apply(func))
+        with pytest.warns(None):
+            assert_eq(ddf.groupby('a').apply(func),
+                      pdf.groupby('a').apply(func))
 
-        assert_eq(ddf.groupby('a').apply(func).set_index('a'),
-                  pdf.groupby('a').apply(func).set_index('a'))
+            assert_eq(ddf.groupby('a').apply(func).set_index('a'),
+                      pdf.groupby('a').apply(func).set_index('a'))
 
-        assert_eq(pdf2.groupby(pdf2.index).apply(func),
-                  ddf2.groupby(ddf2.index).apply(func))
+            assert_eq(pdf2.groupby(pdf2.index).apply(func),
+                      ddf2.groupby(ddf2.index).apply(func))
 
 
 def test_groupby_multilevel_getitem():
 
@@ -258,7 +261,8 @@ def test_series_groupby_propagates_names():
     df = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
     ddf = dd.from_pandas(df, 2)
     func = lambda df: df['y'].sum()
-    result = ddf.groupby('x').apply(func)
+    with pytest.warns(UserWarning):
+        result = ddf.groupby('x').apply(func)
     expected = df.groupby('x').apply(func)
     assert_eq(result, expected)
 
@@ -507,40 +511,42 @@ def call(g, m, **kwargs):
 
 
 def test_apply_shuffle():
+    import warnings; warnings.simplefilter("error", UserWarning)
     pdf = pd.DataFrame({'A': [1, 2, 3, 4] * 5,
                         'B': np.random.randn(20),
                         'C': np.random.randn(20),
                         'D': np.random.randn(20)})
     ddf = dd.from_pandas(pdf, 3)
 
-    assert_eq(ddf.groupby('A').apply(lambda x: x.sum()),
-              pdf.groupby('A').apply(lambda x: x.sum()))
+    with pytest.warns(UserWarning):
+        assert_eq(ddf.groupby('A').apply(lambda x: x.sum()),
+                  pdf.groupby('A').apply(lambda x: x.sum()))
 
-    assert_eq(ddf.groupby(ddf['A']).apply(lambda x: x.sum()),
-              pdf.groupby(pdf['A']).apply(lambda x: x.sum()))
+        assert_eq(ddf.groupby(ddf['A']).apply(lambda x: x.sum()),
+                  pdf.groupby(pdf['A']).apply(lambda x: x.sum()))
 
-    assert_eq(ddf.groupby(ddf['A'] + 1).apply(lambda x: x.sum()),
-              pdf.groupby(pdf['A'] + 1).apply(lambda x: x.sum()))
+        assert_eq(ddf.groupby(ddf['A'] + 1).apply(lambda x: x.sum()),
+                  pdf.groupby(pdf['A'] + 1).apply(lambda x: x.sum()))
 
-    # SeriesGroupBy
-    assert_eq(ddf.groupby('A')['B'].apply(lambda x: x.sum()),
-              pdf.groupby('A')['B'].apply(lambda x: x.sum()))
+        # SeriesGroupBy
+        assert_eq(ddf.groupby('A')['B'].apply(lambda x: x.sum()),
+                  pdf.groupby('A')['B'].apply(lambda x: x.sum()))
 
-    assert_eq(ddf.groupby(ddf['A'])['B'].apply(lambda x: x.sum()),
-              pdf.groupby(pdf['A'])['B'].apply(lambda x: x.sum()))
+        assert_eq(ddf.groupby(ddf['A'])['B'].apply(lambda x: x.sum()),
+                  pdf.groupby(pdf['A'])['B'].apply(lambda x: x.sum()))
 
-    assert_eq(ddf.groupby(ddf['A'] + 1)['B'].apply(lambda x: x.sum()),
-              pdf.groupby(pdf['A'] + 1)['B'].apply(lambda x: x.sum()))
+        assert_eq(ddf.groupby(ddf['A'] + 1)['B'].apply(lambda x: x.sum()),
+                  pdf.groupby(pdf['A'] + 1)['B'].apply(lambda x: x.sum()))
 
-    # DataFrameGroupBy with column slice
-    assert_eq(ddf.groupby('A')[['B', 'C']].apply(lambda x: x.sum()),
-              pdf.groupby('A')[['B', 'C']].apply(lambda x: x.sum()))
+        # DataFrameGroupBy with column slice
+        assert_eq(ddf.groupby('A')[['B', 'C']].apply(lambda x: x.sum()),
+                  pdf.groupby('A')[['B', 'C']].apply(lambda x: x.sum()))
 
-    assert_eq(ddf.groupby(ddf['A'])[['B', 'C']].apply(lambda x: x.sum()),
-              pdf.groupby(pdf['A'])[['B', 'C']].apply(lambda x: x.sum()))
+        assert_eq(ddf.groupby(ddf['A'])[['B', 'C']].apply(lambda x: x.sum()),
+                  pdf.groupby(pdf['A'])[['B', 'C']].apply(lambda x: x.sum()))
 
-    assert_eq(ddf.groupby(ddf['A'] + 1)[['B', 'C']].apply(lambda x: x.sum()),
-              pdf.groupby(pdf['A'] + 1)[['B', 'C']].apply(lambda x: x.sum()))
+        assert_eq(ddf.groupby(ddf['A'] + 1)[['B', 'C']].apply(lambda x: x.sum()),
+                  pdf.groupby(pdf['A'] + 1)[['B', 'C']].apply(lambda x: x.sum()))
 
 
 @pytest.mark.parametrize('grouper', [
 
@@ -559,17 +565,18 @@ def test_apply_shuffle_multilevel(grouper):
                         'D': np.random.randn(20)})
     ddf = dd.from_pandas(pdf, 3)
 
-    # DataFrameGroupBy
-    assert_eq(ddf.groupby(grouper(ddf)).apply(lambda x: x.sum()),
-              pdf.groupby(grouper(pdf)).apply(lambda x: x.sum()))
+    with pytest.warns(UserWarning):
+        # DataFrameGroupBy
+        assert_eq(ddf.groupby(grouper(ddf)).apply(lambda x: x.sum()),
+                  pdf.groupby(grouper(pdf)).apply(lambda x: x.sum()))
 
-    # SeriesGroupBy
-    assert_eq(ddf.groupby(grouper(ddf))['B'].apply(lambda x: x.sum()),
-              pdf.groupby(grouper(pdf))['B'].apply(lambda x: x.sum()))
+        # SeriesGroupBy
+        assert_eq(ddf.groupby(grouper(ddf))['B'].apply(lambda x: x.sum()),
+                  pdf.groupby(grouper(pdf))['B'].apply(lambda x: x.sum()))
 
-    # DataFrameGroupBy with column slice
-    assert_eq(ddf.groupby(grouper(ddf))[['B', 'C']].apply(lambda x: x.sum()),
-              pdf.groupby(grouper(pdf))[['B', 'C']].apply(lambda x: x.sum()))
+        # DataFrameGroupBy with column slice
+        assert_eq(ddf.groupby(grouper(ddf))[['B', 'C']].apply(lambda x: x.sum()),
+                  pdf.groupby(grouper(pdf))[['B', 'C']].apply(lambda x: x.sum()))
 
 
 def test_numeric_column_names():
 
@@ -581,7 +588,7 @@ def test_numeric_column_names():
     ddf = dd.from_pandas(df, npartitions=2)
     assert_eq(ddf.groupby(0).sum(), df.groupby(0).sum())
     assert_eq(ddf.groupby([0, 2]).sum(), df.groupby([0, 2]).sum())
-    assert_eq(ddf.groupby(0).apply(lambda x: x),
+    assert_eq(ddf.groupby(0).apply(lambda x: x, meta={0: int, 1: int, 2: int}),
              df.groupby(0).apply(lambda x: x))
 
 
@@ -594,12 +601,14 @@ def test_groupby_apply_tasks():
     with dask.set_options(shuffle='tasks'):
         for ind in [lambda x: 'A', lambda x: x.A]:
             a = df.groupby(ind(df)).apply(len)
-            b = ddf.groupby(ind(ddf)).apply(len)
+            with pytest.warns(UserWarning):
+                b = ddf.groupby(ind(ddf)).apply(len)
            assert_eq(a, b.compute())
             assert not any('partd' in k[0] for k in b.dask)
 
             a = df.groupby(ind(df)).B.apply(len)
-            b = ddf.groupby(ind(ddf)).B.apply(len)
+            with pytest.warns(UserWarning):
+                b = ddf.groupby(ind(ddf)).B.apply(len)
             assert_eq(a, b.compute())
             assert not any('partd' in k[0] for k in b.dask)
 
@@ -610,7 +619,8 @@ def test_groupby_multiprocessing():
                        'B': ['1', '1', 'a', 'a', 'a']})
     ddf = dd.from_pandas(df, npartitions=3)
     with dask.set_options(get=get):
-        assert_eq(ddf.groupby('B').apply(lambda x: x),
+        assert_eq(ddf.groupby('B').apply(lambda x: x, meta={"A": int,
+                                                            "B": object}),
                  df.groupby('B').apply(lambda x: x))
 
 
@@ -652,8 +662,12 @@ def test_aggregate__examples(spec, split_every, grouper):
                        columns=['c', 'b', 'a', 'd'])
     ddf = dd.from_pandas(pdf, npartitions=10)
 
-    assert_eq(pdf.groupby(grouper(pdf)).agg(spec),
-              ddf.groupby(grouper(ddf)).agg(spec, split_every=split_every))
+    # Warning from pandas deprecation .agg(dict[dict])
+    # it's from pandas, so no reason to assert the deprecation warning,
+    # but we should still test it for now
+    with pytest.warns(None):
+        assert_eq(pdf.groupby(grouper(pdf)).agg(spec),
+                  ddf.groupby(grouper(ddf)).agg(spec, split_every=split_every))
 
 
 @pytest.mark.parametrize('spec', [
 
@@ -678,9 +692,12 @@ def test_series_aggregate__examples(spec, split_every, grouper):
 
     ddf = dd.from_pandas(pdf, npartitions=10)
     ds = ddf['c']
-
-    assert_eq(ps.groupby(grouper(pdf)).agg(spec),
-              ds.groupby(grouper(ddf)).agg(spec, split_every=split_every))
+    # Warning from pandas deprecation .agg(dict[dict])
+    # it's from pandas, so no reason to assert the deprecation warning,
+    # but we should still test it for now
+    with pytest.warns(None):
+        assert_eq(ps.groupby(grouper(pdf)).agg(spec),
+                  ds.groupby(grouper(ddf)).agg(spec, split_every=split_every))
 
 
 @pytest.mark.parametrize('spec', [
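As background for the `meta=` additions above (a sketch, not part of the commit): dask's `groupby(...).apply` infers the output schema by running the function on a small dummy frame and emits a UserWarning when `meta` is not given; supplying `meta` explicitly, as these tests now do, avoids both the inference step and the warning. Column names and dtypes below are illustrative.

import pandas as pd
import dask.dataframe as dd

df = pd.DataFrame({"a": [1, 2, 1, 2], "b": [1.0, 2.0, 3.0, 4.0]})
ddf = dd.from_pandas(df, npartitions=2)


def demean(part):
    # subtract the group mean from column 'b', keeping the frame shape
    part["b"] = part.b - part.b.mean()
    return part


# Declaring the output schema up front keeps the computation warning-free.
result = ddf.groupby("a").apply(demean, meta={"a": int, "b": float})
print(result.compute())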
