Skip to content

Commit a25c883

Browse files
committed
COMPAT: Pandas 0.22.0 astype for categorical dtypes
Change in pandas-dev/pandas#18710 caused a dask failure when reading CSV files, as our `.astype` relied on the old (broken) behavior. Closes dask#2996
1 parent 2b1b640 commit a25c883

File tree

3 files changed

+69
-3
lines changed

3 files changed

+69
-3
lines changed

dask/dataframe/io/csv.py

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,12 @@
2424

2525
if PANDAS_VERSION >= '0.20.0':
2626
from pandas.api.types import (is_integer_dtype, is_float_dtype,
27-
is_object_dtype, is_datetime64_any_dtype)
27+
is_object_dtype, is_datetime64_any_dtype,
28+
is_categorical_dtype)
2829
else:
2930
from pandas.types.common import (is_integer_dtype, is_float_dtype,
30-
is_object_dtype, is_datetime64_any_dtype)
31+
is_object_dtype, is_datetime64_any_dtype,
32+
is_categorical_dtype)
3133

3234

3335
delayed = delayed(pure=True)
@@ -70,6 +72,31 @@ def pandas_read_text(reader, b, header, kwargs, dtypes=None, columns=None,
7072
return df
7173

7274

75+
def _union_categorical_dtypes(previous, new):
76+
"""Union the dtypes from two blocks of categoricals
77+
78+
Parameters
79+
----------
80+
previous : Index
81+
The values in ``df[c].cat.categories``
82+
new : str or CategoricalDtype
83+
For old pandas, only the str 'category' is allowed.
84+
For newer pandas, ``new`` may be a ``CategoricalDtype``
85+
86+
Returns
87+
-------
88+
unioned : str or CategoricalDtype
89+
"""
90+
if isinstance(new, str):
91+
# Should just be 'category'
92+
return new
93+
old_categories = previous.tolist()
94+
new_categoires = new.categories.tolist()
95+
# Index.union sorts, so we just append and then unique
96+
unioned = pd.Index(old_categories + new_categoires).unique()
97+
return pd.api.types.CategoricalDtype(unioned, ordered=new.ordered)
98+
99+
73100
def coerce_dtypes(df, dtypes):
74101
""" Coerce dataframe to dtypes safely
75102
@@ -97,7 +124,11 @@ def coerce_dtypes(df, dtypes):
97124
bad_dates.append(c)
98125
else:
99126
try:
100-
df[c] = df[c].astype(dtypes[c])
127+
if is_categorical_dtype(df[c]):
128+
dtype = _union_categorical_dtypes(df[c].cat.categories, dtypes[c])
129+
df[c] = df[c].astype(dtype)
130+
else:
131+
df[c] = df[c].astype(dtypes[c])
101132
except Exception as e:
102133
bad_dtypes.append((c, actual, desired))
103134
errors.append((c, e))

dask/dataframe/io/tests/test_csv.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -384,6 +384,40 @@ def test_categorical_dtypes():
384384
['apple', 'banana', 'orange', 'pear'])
385385

386386

387+
# ``pd.api.types.CategoricalDtype`` first shipped in pandas 0.21.0, so skip on
# anything older (0.20.x does not have it).
@pytest.mark.skipif(PANDAS_VERSION < '0.21.0',
                    reason="Uses CategoricalDtype")
def test_categorical_ordered():
    """Categorical columns read from several CSV files get unioned categories.

    The first file only contains 'a' and 'b'; the second also contains 'c'.
    The result is checked for ``dtype='category'``, for an unordered
    ``CategoricalDtype`` and for an ordered ``CategoricalDtype``.
    """
    text1 = normalize_text("""
    A
    a
    b
    a
    """)
    text2 = normalize_text("""
    A
    a
    b
    c
    """)
    dtype = pd.api.types.CategoricalDtype(['a', 'b', 'c'])
    with filetexts({"foo.1.csv": text1, "foo.2.csv": text2}):
        # plain 'category' string
        result = dd.read_csv("foo.*.csv", dtype={"A": 'category'})
        expected = pd.DataFrame({
            "A": pd.Categorical(['a', 'b', 'a', 'a', 'b', 'c'],
                                categories=dtype.categories)},
            index=[0, 1, 2, 0, 1, 2])
        assert_eq(result, expected)

        # explicit, unordered CategoricalDtype
        result = dd.read_csv("foo.*.csv", dtype=dtype)
        assert_eq(result, expected)

        # ordered
        dtype = pd.api.types.CategoricalDtype(['a', 'b', 'c'], ordered=True)
        result = dd.read_csv("foo.*.csv", dtype=dtype)
        expected['A'] = expected['A'].cat.as_ordered()
        assert_eq(result, expected)
419+
420+
387421
@pytest.mark.slow
388422
def test_compression_multiple_files():
389423
with tmpdir() as tdir:

docs/source/changelog.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ bounds indexes (:pr:`2967`) `Stephan Hoyer`_
2424
DataFrame
2525
+++++++++
2626

27+
- Compatibility with pandas 0.22.0 (:issue:`2996`) `Tom Augspurger`_
2728
- Prevent ``bool()`` coercion from calling compute (:pr:`2958`) `Albert DeFusco`_
2829
- ``DataFrame.read_sql()`` (:pr:`2928`) on an empty database table returns an empty dask dataframe `Apostolos Vlachopoulos`_
2930
- Fixed ``dd.concat`` losing the index dtype when the data contained a categorical (:issue:`2932`) `Tom Augspurger`_

0 commit comments

Comments
 (0)