COMPAT: For pandas 0.21 CategoricalDtype

TomAugspurger · TomAugspurger · commit f8e382cc3bcf · 2017-10-05T06:40:52.000-05:00
diff --git a/dask/array/percentile.py b/dask/array/percentile.py
@@ -17,7 +17,7 @@ def _percentile(a, q, interpolation='linear'):
         return None
     if isinstance(q, Iterator):
         q = list(q)
-    if str(a.dtype) == 'category':
+    if a.dtype.name == 'category':
         result = np.percentile(a.codes, q, interpolation=interpolation)
         import pandas as pd
         return pd.Categorical.from_codes(result, a.categories, a.ordered)
@@ -100,7 +100,7 @@ def merge_percentiles(finalq, qs, vals, Ns, interpolation='lower'):
 
     # TODO: Perform this check above in percentile once dtype checking is easy
     #       Here we silently change meaning
-    if str(vals[0].dtype) == 'category':
+    if vals[0].dtype.name == 'category':
         result = merge_percentiles(finalq, qs, [v.codes for v in vals], Ns, interpolation)
         import pandas as pd
         return pd.Categorical.from_codes(result, vals[0].categories, vals[0].ordered)
diff --git a/dask/dataframe/io/tests/test_io.py b/dask/dataframe/io/tests/test_io.py
@@ -4,8 +4,7 @@
 
 import pytest
 from threading import Lock
-
-import threading
+from multiprocessing.pool import ThreadPool
 
 import dask.array as da
 import dask.dataframe as dd
@@ -15,7 +14,7 @@
 from dask.utils import tmpfile
 from dask.local import get_sync
 
-from dask.dataframe.utils import assert_eq
+from dask.dataframe.utils import assert_eq, is_categorical_dtype
 
 
 ####################
@@ -119,13 +118,14 @@ def test_from_array_with_record_dtype():
 
 def test_from_bcolz_multiple_threads():
     bcolz = pytest.importorskip('bcolz')
+    pool = ThreadPool(processes=5)
 
-    def check():
+    def check(i):
         t = bcolz.ctable([[1, 2, 3], [1., 2., 3.], ['a', 'b', 'a']],
                          names=['x', 'y', 'a'])
         d = dd.from_bcolz(t, chunksize=2)
         assert d.npartitions == 2
-        assert str(d.dtypes['a']) == 'category'
+        assert is_categorical_dtype(d.dtypes['a'])
         assert list(d.x.compute(get=get_sync)) == [1, 2, 3]
         assert list(d.a.compute(get=get_sync)) == ['a', 'b', 'a']
 
@@ -139,14 +139,7 @@ def check():
         assert (sorted(dd.from_bcolz(t, chunksize=2).dask) !=
                 sorted(dd.from_bcolz(t, chunksize=3).dask))
 
-    threads = []
-    for i in range(5):
-        thread = threading.Thread(target=check)
-        thread.start()
-        threads.append(thread)
-
-    for thread in threads:
-        thread.join()
+    pool.map(check, range(5))
 
 
 def test_from_bcolz():
@@ -156,7 +149,7 @@ def test_from_bcolz():
                      names=['x', 'y', 'a'])
     d = dd.from_bcolz(t, chunksize=2)
     assert d.npartitions == 2
-    assert str(d.dtypes['a']) == 'category'
+    assert is_categorical_dtype(d.dtypes['a'])
     assert list(d.x.compute(get=get_sync)) == [1, 2, 3]
     assert list(d.a.compute(get=get_sync)) == ['a', 'b', 'a']
     L = list(d.index.compute(get=get_sync))
diff --git a/dask/dataframe/partitionquantiles.py b/dask/dataframe/partitionquantiles.py
@@ -79,6 +79,7 @@
 from ..utils import random_state_data
 from ..base import tokenize
 from .core import Series
+from .utils import is_categorical_dtype
 from dask.compatibility import zip
 
 
@@ -363,7 +364,7 @@ def process_val_weights(vals_and_weights, npartitions, dtype_info):
         rv = np.concatenate([trimmed, jumbo_vals])
         rv.sort()
 
-    if str(dtype) == 'category':
+    if is_categorical_dtype(dtype):
         rv = pd.Categorical.from_codes(rv, info[0], info[1])
     elif 'datetime64' in str(dtype):
         rv = pd.DatetimeIndex(rv, dtype=dtype)
@@ -398,7 +399,7 @@ def percentiles_summary(df, num_old, num_new, upsample, state):
     qs = sample_percentiles(num_old, num_new, length, upsample, random_state)
     data = df.values
     interpolation = 'linear'
-    if str(data.dtype) == 'category':
+    if is_categorical_dtype(data):
         data = data.codes
         interpolation = 'nearest'
     vals = _percentile(data, qs, interpolation=interpolation)
@@ -410,7 +411,7 @@ def percentiles_summary(df, num_old, num_new, upsample, state):
 
 def dtype_info(df):
     info = None
-    if str(df.dtype) == 'category':
+    if is_categorical_dtype(df):
         data = df.values
         info = (data.categories, data.ordered)
     return df.dtype, info
diff --git a/dask/dataframe/tests/test_categorical.py b/dask/dataframe/tests/test_categorical.py
@@ -202,7 +202,7 @@ def test_categorize_index():
 @pytest.mark.parametrize('shuffle', ['disk', 'tasks'])
 def test_categorical_set_index(shuffle):
     df = pd.DataFrame({'x': [1, 2, 3, 4], 'y': ['a', 'b', 'b', 'c']})
-    df['y'] = df.y.astype('category', ordered=True)
+    df['y'] = pd.Categorical(df['y'], categories=['a', 'b', 'c'], ordered=True)
     a = dd.from_pandas(df, npartitions=2)
 
     with dask.set_options(get=dask.get, shuffle=shuffle):
diff --git a/dask/dataframe/tests/test_dataframe.py b/dask/dataframe/tests/test_dataframe.py
@@ -2202,9 +2202,9 @@ def test_categorize_info():
                                      "Int64Index: 4 entries, 0 to 3\n"
                                      "Data columns (total 3 columns):\n"
                                      "x    4 non-null int64\n"
-                                     "y    4 non-null category\n"
+                                     "y    4 non-null CategoricalDtype(categories=['a', 'b', 'c'], ordered=False)\n"  # noqa
                                      "z    4 non-null object\n"
-                                     "dtypes: category(1), object(1), int64(1)")
+                                     "dtypes: CategoricalDtype(categories=['a', 'b', 'c'], ordered=False)(1), object(1), int64(1)")  # noqa
 
 
 def test_gh_1301():
diff --git a/dask/dataframe/tests/test_format.py b/dask/dataframe/tests/test_format.py
@@ -1,5 +1,6 @@
 # coding: utf-8
 import pandas as pd
+from textwrap import dedent
 
 import dask.dataframe as dd
 from dask.dataframe.utils import PANDAS_VERSION
@@ -440,14 +441,26 @@ def test_index_format():
     s = pd.Series([1, 2, 3, 4, 5, 6, 7, 8],
                   index=pd.CategoricalIndex([1, 2, 3, 4, 5, 6, 7, 8], name='YYY'))
     ds = dd.from_pandas(s, 3)
-    exp = """Dask Index Structure:
-npartitions=3
-1    category[known]
-4                ...
-7                ...
-8                ...
-Name: YYY, dtype: category
-Dask Name: from_pandas, 6 tasks"""
+    if PANDAS_VERSION >= '0.21.0':
+        exp = dedent("""\
+        Dask Index Structure:
+        npartitions=3
+        1    category[known]
+        4                ...
+        7                ...
+        8                ...
+        Name: YYY, dtype: CategoricalDtype(categories=[1, 2, 3, 4, 5, 6, 7, 8], ordered=False)
+        Dask Name: from_pandas, 6 tasks""")
+    else:
+        exp = dedent("""\
+        Dask Index Structure:
+        npartitions=3
+        1    category[known]
+        4                ...
+        7                ...
+        8                ...
+        Name: YYY, dtype: category
+        Dask Name: from_pandas, 6 tasks""")
     assert repr(ds.index) == exp
     assert str(ds.index) == exp
 
@@ -456,17 +469,36 @@ def test_categorical_format():
     s = pd.Series(['a', 'b', 'c']).astype('category')
     known = dd.from_pandas(s, npartitions=1)
     unknown = known.cat.as_unknown()
-    exp = ("Dask Series Structure:\n"
-           "npartitions=1\n"
-           "0    category[known]\n"
-           "2                ...\n"
-           "dtype: category\n"
-           "Dask Name: from_pandas, 1 tasks")
+    if PANDAS_VERSION >= '0.21.0':
+        exp = dedent("""\
+        Dask Series Structure:
+        npartitions=1
+        0    category[known]
+        2                ...
+        dtype: CategoricalDtype(categories=['a', 'b', 'c'], ordered=False)
+        Dask Name: from_pandas, 1 tasks""")
+    else:
+        exp = ("Dask Series Structure:\n"
+               "npartitions=1\n"
+               "0    category[known]\n"
+               "2                ...\n"
+               "dtype: category\n"
+               "Dask Name: from_pandas, 1 tasks")
     assert repr(known) == exp
-    exp = ("Dask Series Structure:\n"
-           "npartitions=1\n"
-           "0    category[unknown]\n"
-           "2                  ...\n"
-           "dtype: category\n"
-           "Dask Name: from_pandas, 1 tasks")
+    if PANDAS_VERSION >= '0.21.0':
+        exp = dedent("""\
+        Dask Series Structure:
+        npartitions=1
+        0    category[unknown]
+        2                  ...
+        dtype: CategoricalDtype(categories=['__UNKNOWN_CATEGORIES__'], ordered=False)
+        Dask Name: from_pandas, 1 tasks""")
+
+    else:
+        exp = ("Dask Series Structure:\n"
+               "npartitions=1\n"
+               "0    category[unknown]\n"
+               "2                  ...\n"
+               "dtype: category\n"
+               "Dask Name: from_pandas, 1 tasks")
     assert repr(unknown) == exp
diff --git a/dask/dataframe/tests/test_utils_dataframe.py b/dask/dataframe/tests/test_utils_dataframe.py
@@ -163,7 +163,8 @@ def test_meta_nonempty_empty_categories():
         # Series
         s = idx.to_series()
         res = meta_nonempty(s)
-        assert res.dtype == s.dtype
+        assert res.dtype == 'category'
+        assert s.dtype == 'category'
         assert type(res.cat.categories) is type(s.cat.categories)
         assert res.cat.ordered == s.cat.ordered
         assert res.name == s.name
@@ -302,13 +303,30 @@ def test_check_meta():
     df2 = df[['a', 'b', 'd', 'e']]
     with pytest.raises(ValueError) as err:
         check_meta(df2, meta2, funcname='from_delayed')
-    assert str(err.value) == ('Metadata mismatch found in `from_delayed`.\n'
-                              '\n'
-                              'Partition type: `DataFrame`\n'
-                              '+--------+----------+----------+\n'
-                              '| Column | Found    | Expected |\n'
-                              '+--------+----------+----------+\n'
-                              '| a      | object   | category |\n'
-                              '| c      | -        | float64  |\n'
-                              '| e      | category | -        |\n'
-                              '+--------+----------+----------+')
+
+    if PANDAS_VERSION >= '0.21.0':
+        exp = (
+            'Metadata mismatch found in `from_delayed`.\n'
+            '\n'
+            'Partition type: `DataFrame`\n'
+            '+--------+-------------------------------------------------------------+------------------------------------------------+\n'  # noqa
+            '| Column | Found                                                       | Expected                                       |\n'  # noqa
+            '+--------+-------------------------------------------------------------+------------------------------------------------+\n'  # noqa
+            '| a      | object                                                      | CategoricalDtype(categories=[], ordered=False) |\n'  # noqa
+            '| c      | -                                                           | float64                                        |\n'  # noqa
+            "| e      | CategoricalDtype(categories=['x', 'y', 'z'], ordered=False) | -                                              |\n"  # noqa
+            '+--------+-------------------------------------------------------------+------------------------------------------------+'    # noqa
+        )
+    else:
+        exp = (
+            'Metadata mismatch found in `from_delayed`.\n'
+            '\n'
+            'Partition type: `DataFrame`\n'
+            '+--------+----------+----------+\n'
+            '| Column | Found    | Expected |\n'
+            '+--------+----------+----------+\n'
+            '| a      | object   | category |\n'
+            '| c      | -        | float64  |\n'
+            '| e      | category | -        |\n'
+            '+--------+----------+----------+')
+    assert str(err.value) == exp
diff --git a/dask/dataframe/utils.py b/dask/dataframe/utils.py
@@ -454,6 +454,13 @@ def equal_dtypes(a, b):
             return False
         if (a is '-' or b is '-'):
             return False
+        if is_categorical_dtype(a) and is_categorical_dtype(b):
+            # Pandas 0.21 CategoricalDtype compat
+            if (PANDAS_VERSION >= '0.21.0' and
+                    (UNKNOWN_CATEGORIES in a.categories or
+                     UNKNOWN_CATEGORIES in b.categories)):
+                return True
+            return a == b
         return (a.kind in eq_types and b.kind in eq_types) or (a == b)
 
     if not isinstance(meta, (pd.Series, pd.Index, pd.DataFrame)):