Skip to content

BUG: various categorical fixes (GH8626) #8652

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Oct 28, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions doc/source/whatsnew/v0.15.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,41 @@ Experimental
Bug Fixes
~~~~~~~~~


- Bug in coercing ``Categorical`` to a records array, e.g. ``df.to_records()`` (:issue:`8626`)
- Bug in ``Categorical`` not created properly with ``Series.to_frame()`` (:issue:`8626`)
- Bug in ``astype`` coercion when passed ``pd.Categorical`` (the class, which is not a dtype); this now correctly raises ``TypeError`` (:issue:`8626`)
- Bug in ``cut``/``qcut`` when using ``Series`` and ``retbins=True`` (:issue:`8589`)









- Bug in numeric index operations of add/sub with Float/Index Index with numpy arrays (:issue:`8608`)







- Bug in ix/loc block splitting on setitem (manifests with integer-like dtypes, e.g. datetime64) (:issue:`8607`)














- Fix ``shape`` attribute for ``MultiIndex`` (:issue:`8609`)
2 changes: 1 addition & 1 deletion pandas/core/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,7 @@ class Categorical(PandasObject):

# For comparisons, so that numpy uses our implementation of the compare ops, which raise
__array_priority__ = 1000
_typ = 'categorical'
ordered = False
name = None

Expand Down Expand Up @@ -1464,4 +1465,3 @@ def _convert_to_list_like(list_like):
else:
# is this reached?
return [list_like]

6 changes: 5 additions & 1 deletion pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,10 @@ class AmbiguousIndexError(PandasError, KeyError):
def create_pandas_abc_type(name, attr, comp):
@classmethod
def _check(cls, inst):
return getattr(inst, attr, None) in comp
result = getattr(inst, attr, None)
if result is None:
return False
return result in comp
dct = dict(__instancecheck__=_check,
__subclasscheck__=_check)
meta = type("ABCBase", (type,), dct)
Expand All @@ -78,6 +81,7 @@ def _check(cls, inst):
'sparse_time_series'))
ABCSparseArray = create_pandas_abc_type("ABCSparseArray", "_subtyp",
('sparse_array', 'sparse_series'))
# ``comp`` must be a real one-element tuple: a bare parenthesised string would
# turn the ``in`` membership test of the instance check into a substring match
# (e.g. a ``_typ`` of "cat" would be accepted as "categorical").
ABCCategorical = create_pandas_abc_type("ABCCategorical", "_typ",
                                        ("categorical",))


class _ABCGeneric(type):
Expand Down
55 changes: 40 additions & 15 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@
from pandas.core.common import (isnull, notnull, PandasError, _try_sort,
_default_index, _maybe_upcast, _is_sequence,
_infer_dtype_from_scalar, _values_from_object,
is_list_like, _get_dtype, _maybe_box_datetimelike)
is_list_like, _get_dtype, _maybe_box_datetimelike,
is_categorical_dtype)
from pandas.core.generic import NDFrame, _shared_docs
from pandas.core.index import Index, MultiIndex, _ensure_index
from pandas.core.indexing import (_maybe_droplevels,
Expand Down Expand Up @@ -332,6 +333,8 @@ def _init_dict(self, data, index, columns, dtype=None):

def _init_ndarray(self, values, index, columns, dtype=None,
copy=False):
# input must be a ndarray, list, Series, index

if isinstance(values, Series):
if columns is None:
if values.name is not None:
Expand All @@ -345,9 +348,41 @@ def _init_ndarray(self, values, index, columns, dtype=None,
if not len(values) and columns is not None and len(columns):
values = np.empty((0, 1), dtype=object)

# helper to create the axes as indexes
def _get_axes(N, K, index=index, columns=columns):
# return axes or defaults

if index is None:
index = _default_index(N)
else:
index = _ensure_index(index)

if columns is None:
columns = _default_index(K)
else:
columns = _ensure_index(columns)
return index, columns

# we could have a categorical type passed or coerced to 'category'
# recast this to an _arrays_to_mgr
if is_categorical_dtype(getattr(values,'dtype',None)) or is_categorical_dtype(dtype):

if not hasattr(values,'dtype'):
values = _prep_ndarray(values, copy=copy)
values = values.ravel()
elif copy:
values = values.copy()

index, columns = _get_axes(len(values),1)
return _arrays_to_mgr([ values ], columns, index, columns,
dtype=dtype)

# by definition an array here
# the dtypes will be coerced to a single dtype
values = _prep_ndarray(values, copy=copy)

if dtype is not None:

if values.dtype != dtype:
try:
values = values.astype(dtype)
Expand All @@ -356,18 +391,7 @@ def _init_ndarray(self, values, index, columns, dtype=None,
% (dtype, orig))
raise_with_traceback(e)

N, K = values.shape

if index is None:
index = _default_index(N)
else:
index = _ensure_index(index)

if columns is None:
columns = _default_index(K)
else:
columns = _ensure_index(columns)

index, columns = _get_axes(*values.shape)
return create_block_manager_from_blocks([values.T], [columns, index])

@property
Expand Down Expand Up @@ -877,7 +901,7 @@ def to_records(self, index=True, convert_datetime64=True):
else:
ix_vals = [self.index.values]

arrays = ix_vals + [self[c].values for c in self.columns]
arrays = ix_vals + [self[c].get_values() for c in self.columns]

count = 0
index_names = list(self.index.names)
Expand All @@ -890,7 +914,7 @@ def to_records(self, index=True, convert_datetime64=True):
index_names = ['index']
names = index_names + lmap(str, self.columns)
else:
arrays = [self[c].values for c in self.columns]
arrays = [self[c].get_values() for c in self.columns]
names = lmap(str, self.columns)

dtype = np.dtype([(x, v.dtype) for x, v in zip(names, arrays)])
Expand Down Expand Up @@ -4729,6 +4753,7 @@ def convert(v):
values = convert(values)

else:

# drop subclass info, do not copy data
values = np.asarray(values)
if copy:
Expand Down
19 changes: 17 additions & 2 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,21 @@ def is_datelike(self):
""" return True if I am datelike (datetime or timedelta) """
return self.is_datetime or self.is_timedelta

def is_categorical_astype(self, dtype):
    """
    Decide whether ``dtype`` is an acceptable categorical target for astype.

    Returns
    -------
    bool
        False when ``com.is_categorical_dtype(dtype)`` is falsy; True when
        ``dtype`` is categorical-like and compares equal to
        ``com.CategoricalDtype()``.

    Raises
    ------
    TypeError
        When ``dtype`` is recognised as categorical-like but does not
        compare equal to ``com.CategoricalDtype()``.
    """
    # anything that is not categorical-like is simply "not a categorical astype"
    if not com.is_categorical_dtype(dtype):
        return False

    target = com.CategoricalDtype()
    if dtype == target:
        return True

    # categorical-like (e.g. a pd.Categorical was passed), but not
    # a valid type for astype-ing
    raise TypeError("invalid type {0} for astype".format(dtype))

def to_dense(self):
return self.values.view()

Expand Down Expand Up @@ -345,7 +360,7 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None,

# may need to convert to categorical
# this is only called for non-categoricals
if com.is_categorical_dtype(dtype):
if self.is_categorical_astype(dtype):
return make_block(Categorical(self.values),
ndim=self.ndim,
placement=self.mgr_locs)
Expand Down Expand Up @@ -1682,7 +1697,7 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None,
raise on an except if raise == True
"""

if dtype == com.CategoricalDtype():
if self.is_categorical_astype(dtype):
values = self.values
else:
values = np.array(self.values).astype(dtype)
Expand Down
73 changes: 72 additions & 1 deletion pandas/tests/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -1072,6 +1072,41 @@ def test_construction_series(self):
df = DataFrame({'x': Series(['a', 'b', 'c'],dtype='category')}, index=index)
tm.assert_frame_equal(df, expected)

def test_construction_frame(self):
# Check which DataFrame construction paths produce a categorical column
# and which ones are expected to fall back to plain (non-categorical) data.

# GH8626

# dict creation: dtype='category' is requested for the dict-built frame and
# column 'A' is compared against a categorical Series
df = DataFrame({ 'A' : list('abc') },dtype='category')
expected = Series(list('abc'),dtype='category')
tm.assert_series_equal(df['A'],expected)

# to_frame: the column is looked up under label 0 when no name is given,
# and under 'foo' when name='foo' is passed
s = Series(list('abc'),dtype='category')
result = s.to_frame()
expected = Series(list('abc'),dtype='category')
tm.assert_series_equal(result[0],expected)
result = s.to_frame(name='foo')
expected = Series(list('abc'),dtype='category')
tm.assert_series_equal(result['foo'],expected)

# list-like creation: a flat list with dtype='category' gives a single
# column labelled 0
df = DataFrame(list('abc'),dtype='category')
expected = Series(list('abc'),dtype='category')
tm.assert_series_equal(df[0],expected)

# these coerce back to object, as the data is spread across columns

# ndim != 1: a Categorical wrapped in a list is expected to equal the frame
# built from the plain nested list
df = DataFrame([pd.Categorical(list('abc'))])
expected = DataFrame([list('abc')])
tm.assert_frame_equal(df,expected)

# mixed: a Categorical row next to a plain list row
df = DataFrame([pd.Categorical(list('abc')),list('def')])
expected = DataFrame([list('abc'),list('def')])
tm.assert_frame_equal(df,expected)

def test_reindex(self):

index = pd.date_range('20000101', periods=3)
Expand Down Expand Up @@ -2223,6 +2258,42 @@ def cmp(a,b):
# array conversion
tm.assert_almost_equal(np.array(s),np.array(s.values))

# valid conversion
for valid in [lambda x: x.astype('category'),
lambda x: x.astype(com.CategoricalDtype()),
lambda x: x.astype('object').astype('category'),
lambda x: x.astype('object').astype(com.CategoricalDtype())]:

result = valid(s)
tm.assert_series_equal(result,s)

# invalid conversion (these are NOT a dtype)
for invalid in [lambda x: x.astype(pd.Categorical),
lambda x: x.astype('object').astype(pd.Categorical)]:
self.assertRaises(TypeError, lambda : invalid(s))


def test_to_records(self):
# Check construction with dtype='category' and the conversion of the
# resulting frame to a record array.

# GH8626

# dict creation
df = DataFrame({ 'A' : list('abc') },dtype='category')
expected = Series(list('abc'),dtype='category')
tm.assert_series_equal(df['A'],expected)

# list-like creation (this single-column frame, labelled 0, is the one
# converted to records below)
df = DataFrame(list('abc'),dtype='category')
expected = Series(list('abc'),dtype='category')
tm.assert_series_equal(df[0],expected)

# to record array
# this coerces: the expected record dtype lists the categorical column as
# object ('O') under the stringified label '0', next to the 'index' field
result = df.to_records()
expected = np.rec.array([(0, 'a'), (1, 'b'), (2, 'c')],
dtype=[('index', '<i8'), ('0', 'O')])
tm.assert_almost_equal(result,expected)

def test_numeric_like_ops(self):

# numeric ops should not succeed
Expand Down Expand Up @@ -2262,7 +2333,7 @@ def get_dir(s):

def test_pickle_v0_14_1(self):
cat = pd.Categorical(values=['a', 'b', 'c'],
levels=['a', 'b', 'c', 'd'],
categories=['a', 'b', 'c', 'd'],
name='foobar', ordered=False)
pickle_path = os.path.join(tm.get_data_path(),
'categorical_0_14_1.pickle')
Expand Down