Skip to content

WIP: generalize categorical to N-dimensions #8012

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1):

Parameters
----------
values : ndarray (1-d)
values : ndarray
Sequence
sort : boolean, default False
Sort by values
Expand Down Expand Up @@ -129,7 +129,8 @@ def factorize(values, sort=False, order=None, na_sentinel=-1):

table = hash_klass(len(vals))
uniques = vec_klass()
labels = table.get_labels(vals, uniques, 0, na_sentinel)
labels = table.get_labels(
vals.ravel(), uniques, 0, na_sentinel).reshape(vals.shape)

labels = com._ensure_platform_int(labels)

Expand Down
104 changes: 52 additions & 52 deletions pandas/core/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,10 +269,23 @@ def __init__(self, values, levels=None, ordered=None, name=None, fastpath=False,
self.levels = levels
self.name = name

def _replace_codes(self, codes):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is used more like a _constructor(...).

"""
Returns a new Categorical with replaced codes but the same levels and
metadata

If codes is a scalar, just return that level.
"""
codes = np.asarray(codes)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will be a problem for Categoricals with only one value (len(self._codes) == 1). Calling a method that uses _replace_codes on such a Categorical will then return a different type.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I will add a test to make sure but I don't think so. A 1d array or list with one element is not a scalar.

if np.isscalar(codes):
return self.levels[codes]
else:
return Categorical(codes, levels=self.levels, ordered=self.ordered,
name=self.name, fastpath=True)

def copy(self):
""" Copy constructor. """
return Categorical(values=self._codes.copy(),levels=self.levels,
name=self.name, ordered=self.ordered, fastpath=True)
return self._replace_codes(self._codes.copy())

@classmethod
def from_array(cls, data):
Expand Down Expand Up @@ -431,8 +444,19 @@ def shape(self):
-------
shape : tuple
"""
return self._codes.shape

return tuple([len(self._codes)])
@property
def size(self):
"""Size of the Categorical.

For internal compatibility with numpy arrays.

Returns
-------
size : int
"""
return self._codes.size

def __array__(self, dtype=None):
""" The numpy array interface.
Expand All @@ -442,11 +466,12 @@ def __array__(self, dtype=None):
values : numpy array
A numpy array of the same dtype as categorical.levels.dtype
"""
return com.take_1d(self.levels.values, self._codes)
return com.take_1d(
self.levels.values, self._codes.ravel()).reshape(self.shape)

@property
def T(self):
return self
return self._replace_codes(self._codes.T)

def get_values(self):
""" Return the values.
Expand Down Expand Up @@ -558,7 +583,7 @@ def ravel(self, order='C'):
-------
raveled : numpy array
"""
return np.array(self)
return np.array(self._replace_codes(self._codes.ravel(order=order)))

def view(self):
"""Return a view of myself.
Expand Down Expand Up @@ -628,9 +653,8 @@ def take_nd(self, indexer, allow_fill=True, fill_value=None):
if allow_fill and fill_value is None:
fill_value = np.nan

values = com.take_1d(self._codes, indexer, allow_fill=allow_fill, fill_value=fill_value)
result = Categorical(values=values, levels=self.levels, ordered=self.ordered,
name=self.name, fastpath=True)
codes = com.take_1d(self._codes, indexer, allow_fill=allow_fill, fill_value=fill_value)
result = self._replace_codes(codes)
return result

take = take_nd
Expand All @@ -646,8 +670,7 @@ def _slice(self, slicer):
slicer = slicer[1]

_codes = self._codes[slicer]
return Categorical(values=_codes,levels=self.levels, ordered=self.ordered,
name=self.name, fastpath=True)
return self._replace_codes(_codes)

def __len__(self):
return len(self._codes)
Expand Down Expand Up @@ -738,15 +761,11 @@ def __unicode__(self):

def __getitem__(self, key):
""" Return an item. """
if isinstance(key, (int, np.integer)):
i = self._codes[key]
if i == -1:
return np.nan
else:
return self.levels[i]
else:
return Categorical(values=self._codes[key], levels=self.levels,
ordered=self.ordered, fastpath=True)
return self._replace_codes(self._codes[key])
# if np.isscalar(codes):
# return self.levels[codes]
# else:
# return self._replace_codes(codes)

def __setitem__(self, key, value):
""" Item assignment.
Expand All @@ -760,40 +779,22 @@ def __setitem__(self, key, value):

"""

# require identical level set
if isinstance(value, Categorical):
# require identical level set
if not value.levels.equals(self.levels):
raise ValueError("cannot set a Categorical with another, without identical levels")

rvalue = value if com.is_list_like(value) else [value]
to_add = Index(rvalue)-self.levels
if len(to_add):
raise ValueError("cannot setitem on a Categorical with a new level,"
" set the levels first")

# set by position
if isinstance(key, (int, np.integer)):
pass

# tuple of indexers
elif isinstance(key, tuple):

# only allow 1 dimensional slicing, but can
# in a 2-d case be passed (slice(None),....)
if len(key) == 2:
if not _is_null_slice(key[0]):
raise AssertionError("invalid slicing for a 1-ndim categorical")
key = key[1]
elif len(key) == 1:
key = key[0]
else:
raise AssertionError("invalid slicing for a 1-ndim categorical")
# we can safely assign codes directly
self._codes[key] = value.codes

else:
key = self._codes[key]

lindexer = self.levels.get_indexer(rvalue)
self._codes[key] = lindexer
value = np.asarray(value)
flat_value = value.ravel()
to_add = Index(flat_value) - self.levels
if len(to_add):
raise ValueError("cannot setitem on a Categorical with a new level,"
" set the levels first")
lindexer = self.levels.get_indexer(flat_value)
self._codes[key] = lindexer.reshape(value.shape)

#### reduction ops ####
def _reduce(self, op, axis=0, skipna=True, numeric_only=None,
Expand Down Expand Up @@ -871,9 +872,8 @@ def mode(self):

import pandas.hashtable as htable
good = self._codes != -1
result = Categorical(sorted(htable.mode_int64(com._ensure_int64(self._codes[good]))),
levels=self.levels,ordered=self.ordered, name=self.name,
fastpath=True)
result = self._replace_codes(
sorted(htable.mode_int64(com._ensure_int64(self._codes[good]))))
return result

def unique(self):
Expand Down
12 changes: 0 additions & 12 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -1649,18 +1649,6 @@ def _validate_merge(self, blocks):

return True

def to_native_types(self, slicer=None, na_rep='', **kwargs):
""" convert to our native types format, slicing if desired """

values = self.values
if slicer is not None:
# Categorical is always one dimension
values = values[slicer]
values = np.array(values, dtype=object)
mask = isnull(values)
values[mask] = na_rep
# Blocks.to_native_type returns list of lists, but we are always only a list
return [values.tolist()]

class DatetimeBlock(Block):
__slots__ = ()
Expand Down
5 changes: 5 additions & 0 deletions pandas/tests/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,11 @@ def test_mixed(self):
self.assert_numpy_array_equal(labels, np.array([ 2, 2, -1, 3, 0, 1],dtype=np.int64))
self.assert_numpy_array_equal(uniques, np.array([3.14, np.inf, 'A', 'B'], dtype=object))

def test_multidimensional(self):
labels, uniques = algos.factorize([['a', 'b'], ['a', 'c']])
self.assert_numpy_array_equal(labels, np.array([[0, 1], [0, 2]], dtype=np.int64))
self.assert_numpy_array_equal(uniques, np.array(['a', 'b', 'c'], dtype=object))

def test_datelike(self):

# M8
Expand Down
23 changes: 22 additions & 1 deletion pandas/tests/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,8 @@ def test_print(self):
self.assertEqual(actual, expected)

def test_big_print(self):
factor = Categorical([0,1,2,0,1,2]*100, ['a', 'b', 'c'], name='cat', fastpath=True)
factor = Categorical(np.array([0,1,2,0,1,2]*100), ['a', 'b', 'c'],
name='cat', fastpath=True)
expected = [" a", " b", " c", " a", " b", " c", " a", " b", " c",
" a", " b", " c", " a", "...", " c", " a", " b", " c",
" a", " b", " c", " a", " b", " c", " a", " b", " c",
Expand Down Expand Up @@ -496,6 +497,26 @@ def test_slicing_directly(self):
self.assert_numpy_array_equal(sliced._codes, expected._codes)
tm.assert_index_equal(sliced.levels, expected.levels)

def test_ndimensional_values(self):
exp_arr = np.array([['a', 'b'], ['c', 'b']], dtype=object)
cat = Categorical(exp_arr)

self.assertEqual(cat.shape, (2, 2))
self.assert_numpy_array_equal(cat.__array__(), exp_arr)
self.assert_numpy_array_equal(cat.T, exp_arr.T)
self.assert_numpy_array_equal(cat.ravel(), exp_arr.ravel())

# test indexing
self.assertEqual(cat[0, 0], 'a')
self.assert_numpy_array_equal(cat[0], exp_arr[0])
self.assert_numpy_array_equal(cat[:, :2], exp_arr)
self.assert_numpy_array_equal(cat[[0, 1], [0, 1]], np.diag(exp_arr))
self.assert_numpy_array_equal(cat[0, :], ['a', 'b'])
self.assert_numpy_array_equal(cat[0, [0, 1]], ['a', 'b'])

# TODO: repr, __setitem__, take, min, max, order, describe, _cat_compare_op


class TestCategoricalAsBlock(tm.TestCase):
_multiprocess_can_split_ = True

Expand Down