Pandas 1.0 compat (dask#5782)

TomAugspurger · web-flow · commit 0b9a62b25057 · 2020-01-13T10:17:18.000-06:00
* Use pytest.warns
* Fixed duplicate index: xref pandas-dev/pandas#30965
diff --git a/dask/dataframe/io/parquet/arrow.py b/dask/dataframe/io/parquet/arrow.py
@@ -326,7 +326,12 @@ def read_partition(
         fs, piece, columns, index, categories=(), partitions=(), **kwargs
     ):
         if isinstance(index, list):
-            columns += index
+            for level in index:
+                # unclear if we can use set ops here. I think the order matters.
+                # Need the membership test to avoid duplicating index when
+                # we slice with `columns` later on.
+                if level not in columns:
+                    columns.append(level)
         if isinstance(piece, str):
             # `piece` is a file-path string
             piece = pq.ParquetDatasetPiece(
diff --git a/dask/dataframe/io/tests/test_csv.py b/dask/dataframe/io/tests/test_csv.py
@@ -672,7 +672,7 @@ def test_compression_multiple_files():
         f.write(csv_text.encode())
         f.close()
 
-        with tm.assert_produces_warning(UserWarning):
+        with pytest.warns(UserWarning):
             df = dd.read_csv(os.path.join(tdir, "*.csv.gz"), compression="gzip")
 
         assert len(df.compute()) == (len(csv_text.split("\n")) - 1) * 2
diff --git a/dask/dataframe/io/tests/test_io.py b/dask/dataframe/io/tests/test_io.py
@@ -263,7 +263,7 @@ def test_from_pandas_small():
             ddf = dd.from_pandas(df, npartitions=5, sort=sort)
             assert_eq(df, ddf)
 
-            s = pd.Series([0] * i, name="x")
+            s = pd.Series([0] * i, name="x", dtype=int)
             ds = dd.from_pandas(s, npartitions=5, sort=sort)
             assert_eq(s, ds)
 
diff --git a/dask/dataframe/io/tests/test_parquet.py b/dask/dataframe/io/tests/test_parquet.py
@@ -9,7 +9,6 @@
 import dask
 import dask.multiprocessing
 import dask.dataframe as dd
-from dask.dataframe._compat import tm
 from dask.dataframe.utils import assert_eq, PANDAS_VERSION
 from dask.dataframe.io.parquet.utils import _parse_pandas_metadata
 from dask.dataframe.optimize import optimize_read_parquet_getitem
@@ -908,7 +907,7 @@ def test_empty_partition(tmpdir, engine):
 
 def test_timestamp_index(tmpdir, engine):
     fn = str(tmpdir)
-    df = tm.makeTimeDataFrame()
+    df = dd._compat.makeTimeDataFrame()
     df.index.name = "foo"
     ddf = dd.from_pandas(df, npartitions=5)
     ddf.to_parquet(fn, engine=engine)
diff --git a/dask/tests/test_base.py b/dask/tests/test_base.py
@@ -617,8 +617,8 @@ def test_compute_dataframe():
     ddf1 = ddf.a + 1
     ddf2 = ddf.a + ddf.b
     out1, out2 = compute(ddf1, ddf2)
-    pd.util.testing.assert_series_equal(out1, df.a + 1)
-    pd.util.testing.assert_series_equal(out2, df.a + df.b)
+    pd.testing.assert_series_equal(out1, df.a + 1)
+    pd.testing.assert_series_equal(out2, df.a + df.b)
 
 
 @pytest.mark.skipif("not dd or not da")