diff --git a/doc/source/whatsnew/v0.15.1.txt b/doc/source/whatsnew/v0.15.1.txt index ed080f7f11863..59d96d0ca1c71 100644 --- a/doc/source/whatsnew/v0.15.1.txt +++ b/doc/source/whatsnew/v0.15.1.txt @@ -48,7 +48,41 @@ Experimental Bug Fixes ~~~~~~~~~ + +- Bug in coercing ``Categorical`` to a records array, e.g. ``df.to_records()`` (:issue:`8626`) +- Bug in ``Categorical`` not created properly with ``Series.to_frame()`` (:issue:`8626`) +- Bug in ``astype`` coercion of a ``Categorical`` when passed ``pd.Categorical`` (this now raises ``TypeError`` correctly) (:issue:`8626`) - Bug in ``cut``/``qcut`` when using ``Series`` and ``retbins=True`` (:issue:`8589`) + + + + + + + + + - Bug in numeric index operations of add/sub with Float/Index Index with numpy arrays (:issue:`8608`) + + + + + + + - Bug in ix/loc block splitting on setitem (manifests with integer-like dtypes, e.g. datetime64) (:issue:`8607`) + + + + + + + + + + + + + + - Fix ``shape`` attribute for ``MultiIndex`` (:issue:`8609`) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index b35cfdcf7c8f1..e0d2eaa8a6e0c 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -187,6 +187,7 @@ class Categorical(PandasObject): # For comparisons, so that numpy uses our implementation if the compare ops, which raise __array_priority__ = 1000 + _typ = 'categorical' ordered = False name = None @@ -1464,4 +1465,3 @@ def _convert_to_list_like(list_like): else: # is this reached? 
return [list_like] - diff --git a/pandas/core/common.py b/pandas/core/common.py index 31dc58d1870e0..2839b54b7d71a 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -56,7 +56,10 @@ class AmbiguousIndexError(PandasError, KeyError): def create_pandas_abc_type(name, attr, comp): @classmethod def _check(cls, inst): - return getattr(inst, attr, None) in comp + result = getattr(inst, attr, None) + if result is None: + return False + return result in comp dct = dict(__instancecheck__=_check, __subclasscheck__=_check) meta = type("ABCBase", (type,), dct) @@ -78,6 +81,7 @@ def _check(cls, inst): 'sparse_time_series')) ABCSparseArray = create_pandas_abc_type("ABCSparseArray", "_subtyp", ('sparse_array', 'sparse_series')) +ABCCategorical = create_pandas_abc_type("ABCCategorical","_typ",("categorical")) class _ABCGeneric(type): diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d90ef76ddfa5e..e2c53be1d0cd4 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -26,7 +26,8 @@ from pandas.core.common import (isnull, notnull, PandasError, _try_sort, _default_index, _maybe_upcast, _is_sequence, _infer_dtype_from_scalar, _values_from_object, - is_list_like, _get_dtype, _maybe_box_datetimelike) + is_list_like, _get_dtype, _maybe_box_datetimelike, + is_categorical_dtype) from pandas.core.generic import NDFrame, _shared_docs from pandas.core.index import Index, MultiIndex, _ensure_index from pandas.core.indexing import (_maybe_droplevels, @@ -332,6 +333,8 @@ def _init_dict(self, data, index, columns, dtype=None): def _init_ndarray(self, values, index, columns, dtype=None, copy=False): + # input must be a ndarray, list, Series, index + if isinstance(values, Series): if columns is None: if values.name is not None: @@ -345,9 +348,41 @@ def _init_ndarray(self, values, index, columns, dtype=None, if not len(values) and columns is not None and len(columns): values = np.empty((0, 1), dtype=object) + # helper to create the axes as indexes + def 
_get_axes(N, K, index=index, columns=columns): + # return axes or defaults + + if index is None: + index = _default_index(N) + else: + index = _ensure_index(index) + + if columns is None: + columns = _default_index(K) + else: + columns = _ensure_index(columns) + return index, columns + + # we could have a categorical type passed or coerced to 'category' + # recast this to an _arrays_to_mgr + if is_categorical_dtype(getattr(values,'dtype',None)) or is_categorical_dtype(dtype): + + if not hasattr(values,'dtype'): + values = _prep_ndarray(values, copy=copy) + values = values.ravel() + elif copy: + values = values.copy() + + index, columns = _get_axes(len(values),1) + return _arrays_to_mgr([ values ], columns, index, columns, + dtype=dtype) + + # by definition an array here + # the dtypes will be coerced to a single dtype values = _prep_ndarray(values, copy=copy) if dtype is not None: + if values.dtype != dtype: try: values = values.astype(dtype) @@ -356,18 +391,7 @@ def _init_ndarray(self, values, index, columns, dtype=None, % (dtype, orig)) raise_with_traceback(e) - N, K = values.shape - - if index is None: - index = _default_index(N) - else: - index = _ensure_index(index) - - if columns is None: - columns = _default_index(K) - else: - columns = _ensure_index(columns) - + index, columns = _get_axes(*values.shape) return create_block_manager_from_blocks([values.T], [columns, index]) @property @@ -877,7 +901,7 @@ def to_records(self, index=True, convert_datetime64=True): else: ix_vals = [self.index.values] - arrays = ix_vals + [self[c].values for c in self.columns] + arrays = ix_vals + [self[c].get_values() for c in self.columns] count = 0 index_names = list(self.index.names) @@ -890,7 +914,7 @@ def to_records(self, index=True, convert_datetime64=True): index_names = ['index'] names = index_names + lmap(str, self.columns) else: - arrays = [self[c].values for c in self.columns] + arrays = [self[c].get_values() for c in self.columns] names = lmap(str, self.columns) dtype 
= np.dtype([(x, v.dtype) for x, v in zip(names, arrays)]) @@ -4729,6 +4753,7 @@ def convert(v): values = convert(values) else: + # drop subclass info, do not copy data values = np.asarray(values) if copy: diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 9be680d998216..89e1cd6ce0fb6 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -92,6 +92,21 @@ def is_datelike(self): """ return True if I am a non-datelike """ return self.is_datetime or self.is_timedelta + def is_categorical_astype(self, dtype): + """ + validate that we have a astypeable to categorical, + returns a boolean if we are a categorical + """ + if com.is_categorical_dtype(dtype): + if dtype == com.CategoricalDtype(): + return True + + # this is a pd.Categorical, but is not + # a valid type for astypeing + raise TypeError("invalid type {0} for astype".format(dtype)) + + return False + def to_dense(self): return self.values.view() @@ -345,7 +360,7 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None, # may need to convert to categorical # this is only called for non-categoricals - if com.is_categorical_dtype(dtype): + if self.is_categorical_astype(dtype): return make_block(Categorical(self.values), ndim=self.ndim, placement=self.mgr_locs) @@ -1682,7 +1697,7 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None, raise on an except if raise == True """ - if dtype == com.CategoricalDtype(): + if self.is_categorical_astype(dtype): values = self.values else: values = np.array(self.values).astype(dtype) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 03c73232f13bb..e47d8aaa52c9b 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -1072,6 +1072,41 @@ def test_construction_series(self): df = DataFrame({'x': Series(['a', 'b', 'c'],dtype='category')}, index=index) tm.assert_frame_equal(df, expected) + def test_construction_frame(self): + + # GH8626 + + # dict 
creation + df = DataFrame({ 'A' : list('abc') },dtype='category') + expected = Series(list('abc'),dtype='category') + tm.assert_series_equal(df['A'],expected) + + # to_frame + s = Series(list('abc'),dtype='category') + result = s.to_frame() + expected = Series(list('abc'),dtype='category') + tm.assert_series_equal(result[0],expected) + result = s.to_frame(name='foo') + expected = Series(list('abc'),dtype='category') + tm.assert_series_equal(result['foo'],expected) + + # list-like creation + df = DataFrame(list('abc'),dtype='category') + expected = Series(list('abc'),dtype='category') + tm.assert_series_equal(df[0],expected) + + # these coerces back to object as its spread across columns + + # ndim != 1 + df = DataFrame([pd.Categorical(list('abc'))]) + expected = DataFrame([list('abc')]) + tm.assert_frame_equal(df,expected) + + # mixed + df = DataFrame([pd.Categorical(list('abc')),list('def')]) + expected = DataFrame([list('abc'),list('def')]) + tm.assert_frame_equal(df,expected) + def test_reindex(self): index = pd.date_range('20000101', periods=3) @@ -2223,6 +2258,42 @@ def cmp(a,b): # array conversion tm.assert_almost_equal(np.array(s),np.array(s.values)) + # valid conversion + for valid in [lambda x: x.astype('category'), + lambda x: x.astype(com.CategoricalDtype()), + lambda x: x.astype('object').astype('category'), + lambda x: x.astype('object').astype(com.CategoricalDtype())]: + + result = valid(s) + tm.assert_series_equal(result,s) + + # invalid conversion (these are NOT a dtype) + for invalid in [lambda x: x.astype(pd.Categorical), + lambda x: x.astype('object').astype(pd.Categorical)]: + self.assertRaises(TypeError, lambda : invalid(s)) + + + def test_to_records(self): + + # GH8626 + + # dict creation + df = DataFrame({ 'A' : list('abc') },dtype='category') + expected = Series(list('abc'),dtype='category') + tm.assert_series_equal(df['A'],expected) + + # list-like creation + df = DataFrame(list('abc'),dtype='category') + expected = 
Series(list('abc'),dtype='category') + tm.assert_series_equal(df[0],expected) + + # to record array + # this coerces + result = df.to_records() + expected = np.rec.array([(0, 'a'), (1, 'b'), (2, 'c')], + dtype=[('index', '