diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index cb6f200b259db..4ed0faef398ee 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -101,7 +101,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1): Parameters ---------- - values : ndarray (1-d) + values : ndarray Sequence sort : boolean, default False Sort by values @@ -129,7 +129,8 @@ def factorize(values, sort=False, order=None, na_sentinel=-1): table = hash_klass(len(vals)) uniques = vec_klass() - labels = table.get_labels(vals, uniques, 0, na_sentinel) + labels = table.get_labels( + vals.ravel(), uniques, 0, na_sentinel).reshape(vals.shape) labels = com._ensure_platform_int(labels) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index d049a6d64aac3..04d395e35c4f8 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -269,10 +269,23 @@ def __init__(self, values, levels=None, ordered=None, name=None, fastpath=False, self.levels = levels self.name = name + def _replace_codes(self, codes): + """ + Returns a new Categorical with replaced codes but the same levels and + metadata + + If codes is a scalar, just return that level. + """ + codes = np.asarray(codes) + if np.isscalar(codes): + return self.levels[codes] + else: + return Categorical(codes, levels=self.levels, ordered=self.ordered, + name=self.name, fastpath=True) + def copy(self): """ Copy constructor. """ - return Categorical(values=self._codes.copy(),levels=self.levels, - name=self.name, ordered=self.ordered, fastpath=True) + return self._replace_codes(self._codes.copy()) @classmethod def from_array(cls, data): @@ -431,8 +444,19 @@ def shape(self): ------- shape : tuple """ + return self._codes.shape - return tuple([len(self._codes)]) + @property + def size(self): + """Size of the Categorical. + + For internal compatibility with numpy arrays. + + Returns + ------- + size : int + """ + return self._codes.size def __array__(self, dtype=None): """ The numpy array interface. @@ -442,11 +466,12 @@ def __array__(self, dtype=None): values : numpy array A numpy array of the same dtype as categorical.levels.dtype """ - return com.take_1d(self.levels.values, self._codes) + return com.take_1d( + self.levels.values, self._codes.ravel()).reshape(self.shape) @property def T(self): - return self + return self._replace_codes(self._codes.T) def get_values(self): """ Return the values. @@ -558,7 +583,7 @@ def ravel(self, order='C'): ------- raveled : numpy array """ - return np.array(self) + return np.array(self._replace_codes(self._codes.ravel(order=order))) def view(self): """Return a view of myself. @@ -628,9 +653,8 @@ def take_nd(self, indexer, allow_fill=True, fill_value=None): if allow_fill and fill_value is None: fill_value = np.nan - values = com.take_1d(self._codes, indexer, allow_fill=allow_fill, fill_value=fill_value) - result = Categorical(values=values, levels=self.levels, ordered=self.ordered, - name=self.name, fastpath=True) + codes = com.take_1d(self._codes, indexer, allow_fill=allow_fill, fill_value=fill_value) + result = self._replace_codes(codes) return result take = take_nd @@ -646,8 +670,7 @@ def _slice(self, slicer): slicer = slicer[1] _codes = self._codes[slicer] - return Categorical(values=_codes,levels=self.levels, ordered=self.ordered, - name=self.name, fastpath=True) + return self._replace_codes(_codes) def __len__(self): return len(self._codes) @@ -738,15 +761,11 @@ def __unicode__(self): def __getitem__(self, key): """ Return an item. """ - if isinstance(key, (int, np.integer)): - i = self._codes[key] - if i == -1: - return np.nan - else: - return self.levels[i] - else: - return Categorical(values=self._codes[key], levels=self.levels, - ordered=self.ordered, fastpath=True) + return self._replace_codes(self._codes[key]) + # if np.isscalar(codes): + # return self.levels[codes] + # else: + # return self._replace_codes(codes) def __setitem__(self, key, value): """ Item assignment. @@ -760,40 +779,22 @@ def __setitem__(self, key, value): """ - # require identical level set if isinstance(value, Categorical): + # require identical level set if not value.levels.equals(self.levels): raise ValueError("cannot set a Categorical with another, without identical levels") - - rvalue = value if com.is_list_like(value) else [value] - to_add = Index(rvalue)-self.levels - if len(to_add): - raise ValueError("cannot setitem on a Categorical with a new level," - " set the levels first") - - # set by position - if isinstance(key, (int, np.integer)): - pass - - # tuple of indexers - elif isinstance(key, tuple): - - # only allow 1 dimensional slicing, but can - # in a 2-d case be passd (slice(None),....) - if len(key) == 2: - if not _is_null_slice(key[0]): - raise AssertionError("invalid slicing for a 1-ndim categorical") - key = key[1] - elif len(key) == 1: - key = key[0] - else: - raise AssertionError("invalid slicing for a 1-ndim categorical") + # we can safely assign codes directly + self._codes[key] = value.codes else: - key = self._codes[key] - - lindexer = self.levels.get_indexer(rvalue) - self._codes[key] = lindexer + value = np.asarray(value) + flat_value = value.ravel() + to_add = Index(flat_value) - self.levels + if len(to_add): + raise ValueError("cannot setitem on a Categorical with a new level," + " set the levels first") + lindexer = self.levels.get_indexer(flat_value) + self._codes[key] = lindexer.reshape(value.shape) #### reduction ops #### def _reduce(self, op, axis=0, skipna=True, numeric_only=None, @@ -871,9 +872,8 @@ def mode(self): import pandas.hashtable as htable good = self._codes != -1 - result = Categorical(sorted(htable.mode_int64(com._ensure_int64(self._codes[good]))), - levels=self.levels,ordered=self.ordered, name=self.name, - fastpath=True) + result = self._replace_codes( + sorted(htable.mode_int64(com._ensure_int64(self._codes[good])))) return result def unique(self): diff --git a/pandas/core/internals.py b/pandas/core/internals.py index f649baeb16278..4bbdc789922e3 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1649,18 +1649,6 @@ def _validate_merge(self, blocks): return True - def to_native_types(self, slicer=None, na_rep='', **kwargs): - """ convert to our native types format, slicing if desired """ - - values = self.values - if slicer is not None: - # Categorical is always one dimension - values = values[slicer] - values = np.array(values, dtype=object) - mask = isnull(values) - values[mask] = na_rep - # Blocks.to_native_type returns list of lists, but we are always only a list - return [values.tolist()] class DatetimeBlock(Block): __slots__ = () diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 6353ad53a88ef..c11f4ae38f242 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -95,6 +95,11 @@ def test_mixed(self): self.assert_numpy_array_equal(labels, np.array([ 2, 2, -1, 3, 0, 1],dtype=np.int64)) self.assert_numpy_array_equal(uniques, np.array([3.14, np.inf, 'A', 'B'], dtype=object)) + def test_multidimensional(self): + labels, uniques = algos.factorize([['a', 'b'], ['a', 'c']]) + self.assert_numpy_array_equal(labels, np.array([[0, 1], [0, 2]], dtype=np.int64)) + self.assert_numpy_array_equal(uniques, np.array(['a', 'b', 'c'], dtype=object)) + def test_datelike(self): # M8 diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 0aa7f2b67c7c6..d933bdf0ae454 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -222,7 +222,8 @@ def test_print(self): self.assertEqual(actual, expected) def test_big_print(self): - factor = Categorical([0,1,2,0,1,2]*100, ['a', 'b', 'c'], name='cat', fastpath=True) + factor = Categorical(np.array([0,1,2,0,1,2]*100), ['a', 'b', 'c'], + name='cat', fastpath=True) expected = [" a", " b", " c", " a", " b", " c", " a", " b", " c", " a", " b", " c", " a", "...", " c", " a", " b", " c", " a", " b", " c", " a", " b", " c", " a", " b", " c", @@ -496,6 +497,26 @@ def test_slicing_directly(self): self.assert_numpy_array_equal(sliced._codes, expected._codes) tm.assert_index_equal(sliced.levels, expected.levels) + def test_ndimensional_values(self): + exp_arr = np.array([['a', 'b'], ['c', 'b']], dtype=object) + cat = Categorical(exp_arr) + + self.assertEqual(cat.shape, (2, 2)) + self.assert_numpy_array_equal(cat.__array__(), exp_arr) + self.assert_numpy_array_equal(cat.T, exp_arr.T) + self.assert_numpy_array_equal(cat.ravel(), exp_arr.ravel()) + + # test indexing + self.assertEqual(cat[0, 0], 'a') + self.assert_numpy_array_equal(cat[0], exp_arr[0]) + self.assert_numpy_array_equal(cat[:, :2], exp_arr) + self.assert_numpy_array_equal(cat[[0, 1], [0, 1]], np.diag(exp_arr)) + self.assert_numpy_array_equal(cat[0, :], ['a', 'b']) + self.assert_numpy_array_equal(cat[0, [0, 1]], ['a', 'b']) + + # TODO: repr, __setitem__, take, min, max, order, describe, _cat_compare_op + + class TestCategoricalAsBlock(tm.TestCase): _multiprocess_can_split_ = True