Skip to content

Commit 5a3b071

Browse files
committed
BUG: union_categoricals can't handle NaN
- [x] tests added / passed
- [x] passes ``git diff upstream/master | flake8 --diff``
- [x] whatsnew not needed

``union_categoricals`` doesn't handle ``NaN`` properly.

**on current master:**

```
from pandas.types.concat import union_categoricals

union_categoricals([pd.Categorical([np.nan, 1]), pd.Categorical([2, np.nan])])
# [1, 1, 2, 2]
# Categories (2, int64): [1, 2]

union_categoricals([pd.Categorical([np.nan]), pd.Categorical([np.nan])])
# IndexError: cannot do a non-empty take from an empty axes.
```

Author: sinhrks <[email protected]>

Closes #13759 from sinhrks/union_categoricals_nan and squashes the following commits:

4312a32 [sinhrks] BUG: union_categoricals can't handle NaN
1 parent e533947 commit 5a3b071

File tree

2 files changed

+63
-4
lines changed

2 files changed

+63
-4
lines changed

pandas/tools/tests/test_concat.py

+53
Original file line numberDiff line numberDiff line change
@@ -889,6 +889,59 @@ def test_union_categorical(self):
889889
with tm.assertRaises(ValueError):
890890
union_categoricals([])
891891

892+
def test_union_categoricals_nan(self):
893+
# GH 13759
894+
res = union_categoricals([pd.Categorical([1, 2, np.nan]),
895+
pd.Categorical([3, 2, np.nan])])
896+
exp = Categorical([1, 2, np.nan, 3, 2, np.nan])
897+
tm.assert_categorical_equal(res, exp)
898+
899+
res = union_categoricals([pd.Categorical(['A', 'B']),
900+
pd.Categorical(['B', 'B', np.nan])])
901+
exp = Categorical(['A', 'B', 'B', 'B', np.nan])
902+
tm.assert_categorical_equal(res, exp)
903+
904+
val1 = [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-03-01'),
905+
pd.NaT]
906+
val2 = [pd.NaT, pd.Timestamp('2011-01-01'),
907+
pd.Timestamp('2011-02-01')]
908+
909+
res = union_categoricals([pd.Categorical(val1), pd.Categorical(val2)])
910+
exp = Categorical(val1 + val2,
911+
categories=[pd.Timestamp('2011-01-01'),
912+
pd.Timestamp('2011-03-01'),
913+
pd.Timestamp('2011-02-01')])
914+
tm.assert_categorical_equal(res, exp)
915+
916+
# all NaN
917+
res = union_categoricals([pd.Categorical([np.nan, np.nan]),
918+
pd.Categorical(['X'])])
919+
exp = Categorical([np.nan, np.nan, 'X'])
920+
tm.assert_categorical_equal(res, exp)
921+
922+
res = union_categoricals([pd.Categorical([np.nan, np.nan]),
923+
pd.Categorical([np.nan, np.nan])])
924+
exp = Categorical([np.nan, np.nan, np.nan, np.nan])
925+
tm.assert_categorical_equal(res, exp)
926+
927+
def test_union_categoricals_empty(self):
928+
# GH 13759
929+
res = union_categoricals([pd.Categorical([]),
930+
pd.Categorical([])])
931+
exp = Categorical([])
932+
tm.assert_categorical_equal(res, exp)
933+
934+
res = union_categoricals([pd.Categorical([]),
935+
pd.Categorical([1.0])])
936+
exp = Categorical([1.0])
937+
tm.assert_categorical_equal(res, exp)
938+
939+
# to make dtype equal
940+
nanc = pd.Categorical(np.array([np.nan], dtype=np.float64))
941+
res = union_categoricals([nanc,
942+
pd.Categorical([])])
943+
tm.assert_categorical_equal(res, nanc)
944+
892945
def test_concat_bug_1719(self):
893946
ts1 = tm.makeTimeSeries()
894947
ts2 = tm.makeTimeSeries()[::2]

pandas/types/concat.py

+10-4
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import pandas.tslib as tslib
77
from pandas import compat
88
from pandas.compat import map
9+
from pandas.core.algorithms import take_1d
910
from .common import (is_categorical_dtype,
1011
is_sparse,
1112
is_datetimetz,
@@ -254,10 +255,15 @@ def union_categoricals(to_union):
254255

255256
new_codes = []
256257
for c in to_union:
257-
indexer = categories.get_indexer(c.categories)
258-
new_codes.append(indexer.take(c.codes))
259-
codes = np.concatenate(new_codes)
260-
return Categorical(codes, categories=categories, ordered=False,
258+
if len(c.categories) > 0:
259+
indexer = categories.get_indexer(c.categories)
260+
new_codes.append(take_1d(indexer, c.codes, fill_value=-1))
261+
else:
262+
# must be all NaN
263+
new_codes.append(c.codes)
264+
265+
new_codes = np.concatenate(new_codes)
266+
return Categorical(new_codes, categories=categories, ordered=False,
261267
fastpath=True)
262268

263269

0 commit comments

Comments
 (0)