Skip to content

Commit ec0b97b

Browse files
committed
cut back to returning categorical
1 parent 37e0b98 commit ec0b97b

File tree

12 files changed

+193
-101
lines changed

12 files changed

+193
-101
lines changed

pandas/core/algorithms.py

+12-19
Original file line numberDiff line numberDiff line change
@@ -402,29 +402,22 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
402402
try:
403403
from pandas.tools.tile import cut
404404
values = Series(values)
405-
ii, _, lev = cut(values, bins, retbins=True, include_lowest=True)
405+
ii = cut(values, bins, include_lowest=True)
406406
except TypeError:
407407
raise TypeError("bins argument only works with numeric data.")
408408

409-
# if normalizing, we need the total (include NA's)
410-
counts = np.array([len(ii)])
409+
# count, remove nulls (from the index), and restore the interval bins
410+
result = ii.value_counts(dropna=dropna)
411+
result = result[result.index.notnull()]
412+
result.index = result.index.astype('interval')
413+
result = result.sort_index()
411414

412-
# remove NaN ii entries
413-
if dropna:
414-
mask = ii.notnull()
415-
values = values[mask]
416-
ii = ii[mask]
417-
418-
result = values.groupby(ii).count()
419-
420-
# reindex & fill in 0's for non-represented levels
421-
# but don't if we have completely dropped everything
422-
# as it's now a missing level
423-
# this matches our groupby.value_counts behavior
424-
if dropna and not len(values) and not len(result):
425-
result.index = lev[0:0]
426-
else:
427-
result = result.reindex(lev).fillna(0).astype('i8')
415+
# if we are dropna and we have NO values
416+
if dropna and (result.values == 0).all():
417+
result = result.iloc[0:0]
418+
419+
# normalizing is by len of all (regardless of dropna)
420+
counts = np.array([len(ii)])
428421

429422
else:
430423

pandas/core/groupby.py

+4-8
Original file line numberDiff line numberDiff line change
@@ -3068,14 +3068,10 @@ def value_counts(self, normalize=False, sort=True, ascending=False,
30683068
llab = lambda lab, inc: lab[inc]
30693069
else:
30703070

3071-
# lab is an IntervalIndex
3072-
# we get our last level of labels from the
3073-
# II indexer
3074-
# TODO: make this a method on II
3075-
lab, _, lev = cut(val, bins, retbins=True, include_lowest=True)
3076-
3077-
# we compute the levels here rather than use the bins
3078-
# because we may have adjusted them with include_lowest
3071+
# lab is a Categorical with categories an IntervalIndex
3072+
lab = cut(Series(val), bins, include_lowest=True)
3073+
lev = lab.cat.categories
3074+
lab = lev.take(lab.cat.codes)
30793075
llab = lambda lab, inc: lab[inc]._multiindex.labels[-1]
30803076

30813077
if is_interval_dtype(lab):

pandas/indexes/category.py

+10
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from pandas.types.common import (is_categorical_dtype,
88
_ensure_platform_int,
99
is_list_like,
10+
is_interval_dtype,
1011
is_scalar)
1112
from pandas.types.missing import array_equivalent
1213

@@ -266,6 +267,13 @@ def __array__(self, dtype=None):
266267
""" the array interface, return my values """
267268
return np.array(self._data, dtype=dtype)
268269

270+
@Appender(_index_shared_docs['astype'])
271+
def astype(self, dtype, copy=True):
272+
if is_interval_dtype(dtype):
273+
from pandas import IntervalIndex
274+
return IntervalIndex.from_intervals(np.array(self))
275+
return super(CategoricalIndex, self).astype(dtype=dtype, copy=copy)
276+
269277
@cache_readonly
270278
def _isnan(self):
271279
""" return if each value is nan"""
@@ -508,6 +516,8 @@ def take(self, indices, axis=0, allow_fill=True,
508516
na_value=-1)
509517
return self._create_from_codes(taken)
510518

519+
take_nd = take
520+
511521
def map(self, mapper):
512522
"""Apply mapper function to its categories (not codes).
513523

pandas/indexes/interval.py

+38-3
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,18 @@
11
""" define the IntervalIndex """
22

33
import numpy as np
4-
import pandas as pd
54

65
from pandas.types.missing import notnull, isnull
76
from pandas.types.common import (_ensure_platform_int,
87
is_datetime_or_timedelta_dtype,
98
is_integer_dtype,
9+
is_object_dtype,
10+
is_categorical_dtype,
1011
is_float_dtype,
1112
is_interval_dtype)
1213
from pandas.indexes.base import (Index, _ensure_index,
1314
default_pprint, _index_shared_docs)
15+
from pandas.tslib import Timestamp, Timedelta
1416
from pandas.indexes.multi import MultiIndex
1517
from pandas.compat.numpy import function as nv
1618
from pandas.core import common as com
@@ -24,7 +26,7 @@
2426

2527
def _get_next_label(label):
2628
dtype = getattr(label, 'dtype', type(label))
27-
if isinstance(label, (pd.Timestamp, pd.Timedelta)):
29+
if isinstance(label, (Timestamp, Timedelta)):
2830
dtype = 'datetime64'
2931
if is_datetime_or_timedelta_dtype(dtype):
3032
return label + np.timedelta64(1, 'ns')
@@ -39,7 +41,7 @@ def _get_next_label(label):
3941

4042
def _get_prev_label(label):
4143
dtype = getattr(label, 'dtype', type(label))
42-
if isinstance(label, (pd.Timestamp, pd.Timedelta)):
44+
if isinstance(label, (Timestamp, Timedelta)):
4345
dtype = 'datetime64'
4446
if is_datetime_or_timedelta_dtype(dtype):
4547
return label - np.timedelta64(1, 'ns')
@@ -340,6 +342,19 @@ def copy(self, deep=False, name=None):
340342
name = name if name is not None else self.name
341343
return self._shallow_copy(left, right, name=name)
342344

345+
@Appender(_index_shared_docs['astype'])
346+
def astype(self, dtype, copy=True):
347+
if is_interval_dtype(dtype):
348+
if copy:
349+
self = self.copy()
350+
return self
351+
elif is_object_dtype(dtype):
352+
return Index(self.values, dtype=object)
353+
elif is_categorical_dtype(dtype):
354+
from pandas import Categorical
355+
return Categorical(self, ordered=True)
356+
raise ValueError('Cannot cast IntervalIndex to dtype %s' % dtype)
357+
343358
@cache_readonly
344359
def dtype(self):
345360
return np.dtype('O')
@@ -513,6 +528,26 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
513528
else:
514529
return self._tree.get_indexer(target)
515530

531+
def sort_values(self, return_indexer=False, ascending=True):
532+
"""
533+
Return sorted copy of Index
534+
"""
535+
mask = self._mask
536+
537+
# nans are sorted to the highest values
538+
_as = self.argsort()
539+
_as[mask] = -1
540+
541+
if not ascending:
542+
_as = _as[::-1]
543+
544+
sorted_index = self.take(_as)
545+
546+
if return_indexer:
547+
return sorted_index, _as
548+
else:
549+
return sorted_index
550+
516551
def where(self, cond, other=None):
517552
raise NotImplementedError
518553

pandas/src/interval.pyx

+2
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,8 @@ cpdef intervals_to_interval_bounds(ndarray intervals):
161161
interval = intervals[i]
162162
if util._checknull(interval):
163163
mask[i] = 1
164+
left[i] = np.nan
165+
right[i] = np.nan
164166
continue
165167

166168
if not isinstance(interval, Interval):

pandas/tests/indexes/test_category.py

+15-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
import numpy as np
1313

14-
from pandas import Categorical, compat, notnull
14+
from pandas import Categorical, IntervalIndex, compat, notnull
1515
from pandas.util.testing import assert_almost_equal
1616
import pandas.core.config as cf
1717
import pandas as pd
@@ -338,6 +338,20 @@ def test_astype(self):
338338
self.assertIsInstance(result, Index)
339339
self.assertNotIsInstance(result, CategoricalIndex)
340340

341+
# interval
342+
ii = IntervalIndex(left=[-0.001, 2.0],
343+
right=[2, 4],
344+
closed='right')
345+
346+
ci = CategoricalIndex(Categorical.from_codes([0, 1, -1], categories=ii, ordered=True))
347+
348+
result = ci.astype('interval')
349+
expected = ii.take([0, 1, -1])
350+
tm.assert_index_equal(result, expected)
351+
352+
result = IntervalIndex.from_intervals(result.values)
353+
tm.assert_index_equal(result, expected)
354+
341355
def test_reindex_base(self):
342356

343357
# determined by cat ordering

pandas/tests/indexes/test_interval.py

+30
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,27 @@ def test_equals(self):
164164
self.assertFalse(idx.equals(
165165
pd.date_range('20130101', periods=2)))
166166

167+
def test_astype(self):
168+
169+
idx = self.index
170+
171+
for dtype in [np.int64, np.float64, 'datetime64[ns]',
172+
'datetime64[ns, US/Eastern]', 'timedelta64',
173+
'period[M]']:
174+
self.assertRaises(ValueError, idx.astype, dtype)
175+
176+
result = idx.astype(object)
177+
tm.assert_index_equal(result, Index(idx.values, dtype='object'))
178+
self.assertTrue(idx.equals(result))
179+
180+
result = idx.astype('interval')
181+
tm.assert_index_equal(result, idx)
182+
self.assertTrue(result.equals(idx))
183+
184+
result = idx.astype('category')
185+
expected = pd.Categorical(idx, ordered=True)
186+
tm.assert_categorical_equal(result, expected)
187+
167188
def test_where(self):
168189
self.assertRaises(NotImplementedError,
169190
self.index.where,
@@ -474,6 +495,15 @@ def test_sort_values(self):
474495
(2, 3)]).sort_values()
475496
tm.assert_index_equal(expected, actual)
476497

498+
# nan
499+
idx = self.index_with_nan
500+
mask = idx.isnull()
501+
self.assert_numpy_array_equal(mask, np.array([False, True, False]))
502+
503+
result = idx.sort_values()
504+
mask = result.isnull()
505+
self.assert_numpy_array_equal(mask, np.array([False, True, False]))
506+
477507
def test_datetime(self):
478508
dates = pd.date_range('2000', periods=3)
479509
idx = IntervalIndex.from_breaks(dates)

pandas/tests/test_algos.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -485,7 +485,8 @@ def test_value_counts(self):
485485
# tm.assertIsInstance(factor, n)
486486
result = algos.value_counts(factor)
487487
breaks = [-1.194, -0.535, 0.121, 0.777, 1.433]
488-
expected_index = pd.IntervalIndex.from_breaks(breaks)
488+
expected_index = pd.IntervalIndex.from_breaks(
489+
breaks).astype('category')
489490
expected = Series([1, 1, 1, 1],
490491
index=expected_index)
491492
tm.assert_series_equal(result.sort_index(), expected.sort_index())

pandas/tests/types/test_dtypes.py

+1
Original file line numberDiff line numberDiff line change
@@ -364,6 +364,7 @@ def setUp(self):
364364

365365
def test_is_dtype(self):
366366

367+
self.assertTrue(is_interval_dtype('interval'))
367368
self.assertTrue(is_interval_dtype(IntervalIndex.from_tuples([(0, 1)])))
368369
self.assertTrue(is_interval_dtype
369370
(IntervalIndex.from_breaks(np.arange(4))))

0 commit comments

Comments
 (0)