COMPAT: Pandas 0.22.0 astype for categorical dtypes

TomAugspurger · TomAugspurger · commit a25c8831c6bd · 2017-12-14T08:20:07.000-06:00
A change in pandas-dev/pandas#18710 caused a dask failure when reading CSV files, because our `.astype` call relied on the old (broken) behavior. Closes dask#2996.
diff --git a/dask/dataframe/io/csv.py b/dask/dataframe/io/csv.py
@@ -24,10 +24,12 @@
 
 if PANDAS_VERSION >= '0.20.0':
     from pandas.api.types import (is_integer_dtype, is_float_dtype,
-                                  is_object_dtype, is_datetime64_any_dtype)
+                                  is_object_dtype, is_datetime64_any_dtype,
+                                  is_categorical_dtype)
 else:
     from pandas.types.common import (is_integer_dtype, is_float_dtype,
-                                     is_object_dtype, is_datetime64_any_dtype)
+                                     is_object_dtype, is_datetime64_any_dtype,
+                                     is_categorical_dtype)
 
 
 delayed = delayed(pure=True)
@@ -70,6 +72,31 @@ def pandas_read_text(reader, b, header, kwargs, dtypes=None, columns=None,
     return df
 
 
+def _union_categorical_dtypes(previous, new):
+    """Union the dtypes from two blocks of categoricals
+
+    Parameters
+    ----------
+    previous : Index
+        The values in ``df[c].cat.categories``
+    new : str or CategoricalDtype
+        For old pandas, only the str 'category' is allowed.
+        For newer pandas, ``new`` may be a ``CategoricalDtype``
+
+    Returns
+    -------
+    unioned : str or CategoricalDtype
+    """
+    if isinstance(new, str):
+        # Should just be 'category'
+        return new
+    old_categories = previous.tolist()
+    new_categoires = new.categories.tolist()
+    # Index.union sorts, so we just append and then unique
+    unioned = pd.Index(old_categories + new_categoires).unique()
+    return pd.api.types.CategoricalDtype(unioned, ordered=new.ordered)
+
+
 def coerce_dtypes(df, dtypes):
     """ Coerce dataframe to dtypes safely
 
@@ -97,7 +124,11 @@ def coerce_dtypes(df, dtypes):
                 bad_dates.append(c)
             else:
                 try:
-                    df[c] = df[c].astype(dtypes[c])
+                    if is_categorical_dtype(df[c]):
+                        dtype = _union_categorical_dtypes(df[c].cat.categories, dtypes[c])
+                        df[c] = df[c].astype(dtype)
+                    else:
+                        df[c] = df[c].astype(dtypes[c])
                 except Exception as e:
                     bad_dtypes.append((c, actual, desired))
                     errors.append((c, e))
diff --git a/dask/dataframe/io/tests/test_csv.py b/dask/dataframe/io/tests/test_csv.py
@@ -384,6 +384,40 @@ def test_categorical_dtypes():
                 ['apple', 'banana', 'orange', 'pear'])
 
 
+@pytest.mark.skipif(PANDAS_VERSION < '0.20.0',
+                    reason="Uses CategoricalDtype")
+def test_categorical_ordered():
+    text1 = normalize_text("""
+    A
+    a
+    b
+    a
+    """)
+    text2 = normalize_text("""
+    A
+    a
+    b
+    c
+    """)
+    dtype = pd.api.types.CategoricalDtype(['a', 'b', 'c'])
+    with filetexts({"foo.1.csv": text1, "foo.2.csv": text2}):
+        result = dd.read_csv("foo.*.csv", dtype={"A": 'category'})
+        expected = pd.DataFrame({
+            "A": pd.Categorical(['a', 'b', 'a', 'a', 'b', 'c'],
+                                categories=dtype.categories)},
+                                index=[0, 1, 2, 0, 1, 2])
+        assert_eq(result, expected)
+
+        result = dd.read_csv("foo.*.csv", dtype=dtype)
+        assert_eq(result, expected)
+
+        # ordered
+        dtype = pd.api.types.CategoricalDtype(['a', 'b', 'c'], ordered=True)
+        result = dd.read_csv("foo.*.csv", dtype=dtype)
+        expected['A'] = expected['A'].cat.as_ordered()
+        assert_eq(result, expected)
+
+
 @pytest.mark.slow
 def test_compression_multiple_files():
     with tmpdir() as tdir:
diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
@@ -24,6 +24,7 @@ bounds indexes (:pr:`2967`) `Stephan Hoyer`_
 DataFrame
 +++++++++
 
+- Compatibility with pandas 0.22.0 (:issue:`2996`) `Tom Augspurger`_
 - Prevent ``bool()`` coercion from calling compute (:pr:`2958`) `Albert DeFusco`_
 - ``DataFrame.read_sql()`` (:pr:`2928`) to an empty database tables returns an empty dask dataframe `Apostolos Vlachopoulos`_
 - Fixed ``dd.concat`` losing the index dtype when the data contained a categorical (:issue:`2932`) `Tom Augspurger`_