From 47c0695ba3025d1538f964ba6e85868560964aa7 Mon Sep 17 00:00:00 2001 From: behzad nouri Date: Sun, 14 Jun 2015 18:43:23 -0400 Subject: [PATCH] closes bug in apply when function returns categorical --- doc/source/whatsnew/v0.17.0.txt | 1 + pandas/core/internals.py | 3 ++ pandas/src/reduce.pyx | 54 +++++++++++++-------------------- pandas/tests/test_frame.py | 7 +++++ 4 files changed, 32 insertions(+), 33 deletions(-) diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 164ab73def894..4a513f3122390 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -58,3 +58,4 @@ Performance Improvements Bug Fixes ~~~~~~~~~ +- Bug in ``DataFrame.apply`` when function returns categorical series. (:issue:`9573`) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 7d83e45098ae1..4c4d940f8077c 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1670,6 +1670,9 @@ def is_view(self): def to_dense(self): return self.values.to_dense().view() + def convert(self, copy=True, **kwargs): + return [self.copy() if copy else self] + @property def shape(self): return (len(self.mgr_locs), len(self.values)) diff --git a/pandas/src/reduce.pyx b/pandas/src/reduce.pyx index add9a03642bed..09f8e0ab42924 100644 --- a/pandas/src/reduce.pyx +++ b/pandas/src/reduce.pyx @@ -6,6 +6,18 @@ from distutils.version import LooseVersion is_numpy_prior_1_6_2 = LooseVersion(np.__version__) < '1.6.2' +cdef _get_result_array(object obj, + Py_ssize_t size, + Py_ssize_t cnt): + + if isinstance(obj, np.ndarray) \ + or isinstance(obj, list) and len(obj) == cnt \ + or getattr(obj, 'shape', None) == (cnt,): + raise ValueError('function does not reduce') + + return np.empty(size, dtype='O') + + cdef class Reducer: ''' Performs generic reduction operation on a C or Fortran-contiguous ndarray @@ -124,7 +136,9 @@ cdef class Reducer: if hasattr(res,'values'): res = res.values if i == 0: - result = self._get_result_array(res) + result = _get_result_array(res, + self.nresults, + len(self.dummy)) it = PyArray_IterNew(result) PyArray_SETITEM(result, PyArray_ITER_DATA(it), res) @@ -143,17 +157,6 @@ cdef class Reducer: return result - def _get_result_array(self, object res): - try: - assert(not isinstance(res, np.ndarray)) - assert(not (isinstance(res, list) and len(res) == len(self.dummy))) - - result = np.empty(self.nresults, dtype='O') - result[0] = res - except Exception: - raise ValueError('function does not reduce') - return result - cdef class SeriesBinGrouper: ''' @@ -257,8 +260,10 @@ cdef class SeriesBinGrouper: res = self.f(cached_typ) res = _extract_result(res) if not initialized: - result = self._get_result_array(res) initialized = 1 + result = _get_result_array(res, + self.ngroups, + len(self.dummy_arr)) util.assign_value_1d(result, i, res) @@ -277,16 +282,6 @@ cdef class SeriesBinGrouper: return result, counts - def _get_result_array(self, object res): - try: - assert(not isinstance(res, np.ndarray)) - assert(not (isinstance(res, list) and len(res) == len(self.dummy_arr))) - - result = np.empty(self.ngroups, dtype='O') - except Exception: - raise ValueError('function does not reduce') - return result - cdef class SeriesGrouper: ''' @@ -388,8 +383,10 @@ cdef class SeriesGrouper: res = self.f(cached_typ) res = _extract_result(res) if not initialized: - result = self._get_result_array(res) initialized = 1 + result = _get_result_array(res, + self.ngroups, + len(self.dummy_arr)) util.assign_value_1d(result, lab, res) counts[lab] = group_size @@ -410,15 +407,6 @@ cdef class SeriesGrouper: return result, counts - def _get_result_array(self, object res): - try: - assert(not isinstance(res, np.ndarray)) - assert(not (isinstance(res, list) and len(res) == len(self.dummy_arr))) - - result = np.empty(self.ngroups, dtype='O') - except Exception: - raise ValueError('function does not reduce') - return result cdef inline _extract_result(object res): ''' extract the result object, it might be a 0-dim ndarray diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 4b1954a3be64e..a4abe481cfe81 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -10382,6 +10382,13 @@ def test_apply(self): [[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=['a', 'a', 'c']) self.assertRaises(ValueError, df.apply, lambda x: x, 2) + # GH9573 + df = DataFrame({'c0':['A','A','B','B'], 'c1':['C','C','D','D']}) + df = df.apply(lambda ts: ts.astype('category')) + self.assertEqual(df.shape, (4, 2)) + self.assertTrue(isinstance(df['c0'].dtype, com.CategoricalDtype)) + self.assertTrue(isinstance(df['c1'].dtype, com.CategoricalDtype)) + def test_apply_mixed_datetimelike(self): # mixed datetimelike # GH 7778