cut back to returning categorical

jreback · jreback · commit ec0b97b1cf8b · 2017-02-05T11:29:41.000-05:00
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -402,29 +402,22 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
         try:
             from pandas.tools.tile import cut
             values = Series(values)
-            ii, _, lev = cut(values, bins, retbins=True, include_lowest=True)
+            ii = cut(values, bins, include_lowest=True)
         except TypeError:
             raise TypeError("bins argument only works with numeric data.")
 
-        # if normalizing, we need the total (include NA's)
-        counts = np.array([len(ii)])
+        # count, remove nulls (from the index), and sort by the bins
+        result = ii.value_counts(dropna=dropna)
+        result = result[result.index.notnull()]
+        result.index = result.index.astype('interval')
+        result = result.sort_index()
 
-        # remove NaN ii entries
-        if dropna:
-            mask = ii.notnull()
-            values = values[mask]
-            ii = ii[mask]
-
-        result = values.groupby(ii).count()
-
-        # reindex & fill in 0's for non-represented levels
-        # but don't if we have completely dropped everything
-        # as its now a missing level
-        # this matches our groupby.value_counts behavior
-        if dropna and not len(values) and not len(result):
-            result.index = lev[0:0]
-        else:
-            result = result.reindex(lev).fillna(0).astype('i8')
+        # if we are dropna and we have NO values
+        if dropna and (result.values == 0).all():
+            result = result.iloc[0:0]
+
+        # normalizing is by len of all (regardless of dropna)
+        counts = np.array([len(ii)])
 
     else:
 
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -3068,14 +3068,10 @@ def value_counts(self, normalize=False, sort=True, ascending=False,
             llab = lambda lab, inc: lab[inc]
         else:
 
-            # lab is an IntervalIndex
-            # we get our last level of labels from the
-            # II indexer
-            # TODO: make this a method on II
-            lab, _, lev = cut(val, bins, retbins=True, include_lowest=True)
-
-            # we compute the levels here rather than use the bins
-            # because we may have adjusted them with include_lowest
+            # lab is a Categorical with categories an IntervalIndex
+            lab = cut(Series(val), bins, include_lowest=True)
+            lev = lab.cat.categories
+            lab = lev.take(lab.cat.codes)
             llab = lambda lab, inc: lab[inc]._multiindex.labels[-1]
 
         if is_interval_dtype(lab):
diff --git a/pandas/indexes/category.py b/pandas/indexes/category.py
@@ -7,6 +7,7 @@
 from pandas.types.common import (is_categorical_dtype,
                                  _ensure_platform_int,
                                  is_list_like,
+                                 is_interval_dtype,
                                  is_scalar)
 from pandas.types.missing import array_equivalent
 
@@ -266,6 +267,13 @@ def __array__(self, dtype=None):
         """ the array interface, return my values """
         return np.array(self._data, dtype=dtype)
 
+    @Appender(_index_shared_docs['astype'])
+    def astype(self, dtype, copy=True):
+        if is_interval_dtype(dtype):
+            from pandas import IntervalIndex
+            return IntervalIndex.from_intervals(np.array(self))
+        return super(CategoricalIndex, self).astype(dtype=dtype, copy=copy)
+
     @cache_readonly
     def _isnan(self):
         """ return if each value is nan"""
@@ -508,6 +516,8 @@ def take(self, indices, axis=0, allow_fill=True,
                                            na_value=-1)
         return self._create_from_codes(taken)
 
+    take_nd = take
+
     def map(self, mapper):
         """Apply mapper function to its categories (not codes).
 
diff --git a/pandas/indexes/interval.py b/pandas/indexes/interval.py
@@ -1,16 +1,18 @@
 """ define the IntervalIndex """
 
 import numpy as np
-import pandas as pd
 
 from pandas.types.missing import notnull, isnull
 from pandas.types.common import (_ensure_platform_int,
                                  is_datetime_or_timedelta_dtype,
                                  is_integer_dtype,
+                                 is_object_dtype,
+                                 is_categorical_dtype,
                                  is_float_dtype,
                                  is_interval_dtype)
 from pandas.indexes.base import (Index, _ensure_index,
                                  default_pprint, _index_shared_docs)
+from pandas.tslib import Timestamp, Timedelta
 from pandas.indexes.multi import MultiIndex
 from pandas.compat.numpy import function as nv
 from pandas.core import common as com
@@ -24,7 +26,7 @@
 
 def _get_next_label(label):
     dtype = getattr(label, 'dtype', type(label))
-    if isinstance(label, (pd.Timestamp, pd.Timedelta)):
+    if isinstance(label, (Timestamp, Timedelta)):
         dtype = 'datetime64'
     if is_datetime_or_timedelta_dtype(dtype):
         return label + np.timedelta64(1, 'ns')
@@ -39,7 +41,7 @@ def _get_next_label(label):
 
 def _get_prev_label(label):
     dtype = getattr(label, 'dtype', type(label))
-    if isinstance(label, (pd.Timestamp, pd.Timedelta)):
+    if isinstance(label, (Timestamp, Timedelta)):
         dtype = 'datetime64'
     if is_datetime_or_timedelta_dtype(dtype):
         return label - np.timedelta64(1, 'ns')
@@ -340,6 +342,19 @@ def copy(self, deep=False, name=None):
         name = name if name is not None else self.name
         return self._shallow_copy(left, right, name=name)
 
+    @Appender(_index_shared_docs['astype'])
+    def astype(self, dtype, copy=True):
+        if is_interval_dtype(dtype):
+            if copy:
+                self = self.copy()
+            return self
+        elif is_object_dtype(dtype):
+            return Index(self.values, dtype=object)
+        elif is_categorical_dtype(dtype):
+            from pandas import Categorical
+            return Categorical(self, ordered=True)
+        raise ValueError('Cannot cast IntervalIndex to dtype %s' % dtype)
+
     @cache_readonly
     def dtype(self):
         return np.dtype('O')
@@ -513,6 +528,26 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
             else:
                 return self._tree.get_indexer(target)
 
+    def sort_values(self, return_indexer=False, ascending=True):
+        """
+        Return sorted copy of Index
+        """
+        mask = self._mask
+
+        # nans are sorted to the highest values
+        _as = self.argsort()
+        _as[mask] = -1
+
+        if not ascending:
+            _as = _as[::-1]
+
+        sorted_index = self.take(_as)
+
+        if return_indexer:
+            return sorted_index, _as
+        else:
+            return sorted_index
+
     def where(self, cond, other=None):
         raise NotImplementedError
 
diff --git a/pandas/src/interval.pyx b/pandas/src/interval.pyx
@@ -161,6 +161,8 @@ cpdef intervals_to_interval_bounds(ndarray intervals):
         interval = intervals[i]
         if util._checknull(interval):
             mask[i] = 1
+            left[i] = np.nan
+            right[i] = np.nan
             continue
 
         if not isinstance(interval, Interval):
diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py
@@ -11,7 +11,7 @@
 
 import numpy as np
 
-from pandas import Categorical, compat, notnull
+from pandas import Categorical, IntervalIndex, compat, notnull
 from pandas.util.testing import assert_almost_equal
 import pandas.core.config as cf
 import pandas as pd
@@ -338,6 +338,20 @@ def test_astype(self):
         self.assertIsInstance(result, Index)
         self.assertNotIsInstance(result, CategoricalIndex)
 
+        # interval
+        ii = IntervalIndex(left=[-0.001, 2.0],
+                           right=[2, 4],
+                           closed='right')
+
+        ci = CategoricalIndex(Categorical.from_codes([0, 1, -1], categories=ii, ordered=True))
+
+        result = ci.astype('interval')
+        expected = ii.take([0, 1, -1])
+        tm.assert_index_equal(result, expected)
+
+        result = IntervalIndex.from_intervals(result.values)
+        tm.assert_index_equal(result, expected)
+
     def test_reindex_base(self):
 
         # determined by cat ordering
diff --git a/pandas/tests/indexes/test_interval.py b/pandas/tests/indexes/test_interval.py
@@ -164,6 +164,27 @@ def test_equals(self):
         self.assertFalse(idx.equals(
             pd.date_range('20130101', periods=2)))
 
+    def test_astype(self):
+
+        idx = self.index
+
+        for dtype in [np.int64, np.float64, 'datetime64[ns]',
+                      'datetime64[ns, US/Eastern]', 'timedelta64',
+                      'period[M]']:
+            self.assertRaises(ValueError, idx.astype, dtype)
+
+        result = idx.astype(object)
+        tm.assert_index_equal(result, Index(idx.values, dtype='object'))
+        self.assertTrue(idx.equals(result))
+
+        result = idx.astype('interval')
+        tm.assert_index_equal(result, idx)
+        self.assertTrue(result.equals(idx))
+
+        result = idx.astype('category')
+        expected = pd.Categorical(idx, ordered=True)
+        tm.assert_categorical_equal(result, expected)
+
     def test_where(self):
         self.assertRaises(NotImplementedError,
                           self.index.where,
@@ -474,6 +495,15 @@ def test_sort_values(self):
                                             (2, 3)]).sort_values()
         tm.assert_index_equal(expected, actual)
 
+        # nan
+        idx = self.index_with_nan
+        mask = idx.isnull()
+        self.assert_numpy_array_equal(mask, np.array([False, True, False]))
+
+        result = idx.sort_values()
+        mask = result.isnull()
+        self.assert_numpy_array_equal(mask, np.array([False, True, False]))
+
     def test_datetime(self):
         dates = pd.date_range('2000', periods=3)
         idx = IntervalIndex.from_breaks(dates)
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
@@ -485,7 +485,8 @@ def test_value_counts(self):
         # tm.assertIsInstance(factor, n)
         result = algos.value_counts(factor)
         breaks = [-1.194, -0.535, 0.121, 0.777, 1.433]
-        expected_index = pd.IntervalIndex.from_breaks(breaks)
+        expected_index = pd.IntervalIndex.from_breaks(
+            breaks).astype('category')
         expected = Series([1, 1, 1, 1],
                           index=expected_index)
         tm.assert_series_equal(result.sort_index(), expected.sort_index())
diff --git a/pandas/tests/types/test_dtypes.py b/pandas/tests/types/test_dtypes.py
@@ -364,6 +364,7 @@ def setUp(self):
 
     def test_is_dtype(self):
 
+        self.assertTrue(is_interval_dtype('interval'))
         self.assertTrue(is_interval_dtype(IntervalIndex.from_tuples([(0, 1)])))
         self.assertTrue(is_interval_dtype
                         (IntervalIndex.from_breaks(np.arange(4))))
diff --git a/pandas/tools/tests/test_tile.py b/pandas/tools/tests/test_tile.py
diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py
diff --git a/pandas/types/common.py b/pandas/types/common.py