diff --git a/doc/source/whatsnew/v0.16.2.txt b/doc/source/whatsnew/v0.16.2.txt index c219818a62631..feccc19d8f70b 100644 --- a/doc/source/whatsnew/v0.16.2.txt +++ b/doc/source/whatsnew/v0.16.2.txt @@ -163,6 +163,8 @@ Bug Fixes - Bug in GroupBy.get_group raises ValueError when group key contains NaT (:issue:`6992`) - Bug in ``SparseSeries`` constructor ignores input data name (:issue:`10258`) +- Bug in ``Categorical.remove_categories`` causing a ``ValueError`` when removing the ``NaN`` category if the underlying dtype is floating-point (:issue:`10156`) + - Bug where infer_freq infers timerule (WOM-5XXX) unsupported by to_offset (:issue:`9425`) - Bug in ``DataFrame.to_hdf()`` where table format would raise a seemingly unrelated error for invalid (non-string) column names. This is now explicitly forbidden. (:issue:`9057`) - Bug to handle masking empty ``DataFrame``(:issue:`10126`) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index c5cd8390359dc..74007d0127e4f 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -749,11 +749,19 @@ def remove_categories(self, removals, inplace=False): """ if not is_list_like(removals): removals = [removals] - removals = set(list(removals)) - not_included = removals - set(self._categories) + + removal_set = set(list(removals)) + not_included = removal_set - set(self._categories) + new_categories = [ c for c in self._categories if c not in removal_set ] + + # GH 10156 + if any(isnull(removals)): + not_included = [x for x in not_included if notnull(x)] + new_categories = [x for x in new_categories if notnull(x)] + if len(not_included) != 0: raise ValueError("removals must all be in old categories: %s" % str(not_included)) - new_categories = [ c for c in self._categories if c not in removals ] + return self.set_categories(new_categories, ordered=self.ordered, rename=False, inplace=inplace) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 
bec688db99114..bc9279a8d1529 100755 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -854,6 +854,28 @@ def test_nan_handling(self): self.assert_numpy_array_equal(c.categories , np.array(["a","b",np.nan],dtype=np.object_)) self.assert_numpy_array_equal(c._codes , np.array([0,2,-1,0])) + # Remove null categories (GH 10156) + cases = [ + ([1.0, 2.0, np.nan], [1.0, 2.0]), + (['a', 'b', None], ['a', 'b']), + ([pd.Timestamp('2012-05-01'), pd.NaT], [pd.Timestamp('2012-05-01')]) + ] + + null_values = [np.nan, None, pd.NaT] + + for with_null, without in cases: + base = Categorical([], with_null) + expected = Categorical([], without) + + for nullval in null_values: + result = base.remove_categories(nullval) + self.assert_categorical_equal(result, expected) + + # Different null values are indistinguishable + for i, j in [(0, 1), (0, 2), (1, 2)]: + nulls = [null_values[i], null_values[j]] + self.assertRaises(ValueError, lambda: Categorical([], categories=nulls)) + def test_isnull(self): exp = np.array([False, False, True])