diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 2855cde95ac2a..e264fb15f3e67 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -91,8 +91,77 @@ Other enhancements Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. _whatsnew_0200.api: +.. _whatsnew.api_breaking.index_map: + +Map on Index types now returns other Index types +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +- ``map`` on an ``Index`` now returns an ``Index``, not a numpy array (:issue:`12766`) + + .. ipython:: python + + idx = Index([1, 2]) + idx + mi = MultiIndex.from_tuples([(1, 2), (2, 4)]) + mi + + Previous Behavior: + + .. code-block:: ipython + + In [5]: idx.map(lambda x: x * 2) + Out[5]: array([2, 4]) + + In [6]: idx.map(lambda x: (x, x * 2)) + Out[6]: array([(1, 2), (2, 4)], dtype=object) + + In [7]: mi.map(lambda x: x) + Out[7]: array([(1, 2), (2, 4)], dtype=object) + + In [8]: mi.map(lambda x: x[0]) + Out[8]: array([1, 2]) + + New Behavior: + + .. ipython:: python + + idx.map(lambda x: x * 2) + + idx.map(lambda x: (x, x * 2)) + + mi.map(lambda x: x) + + mi.map(lambda x: x[0]) + + +- ``map`` on a Series with datetime64 values may return int64 dtypes rather than int32 + + .. ipython:: python + + s = Series(date_range('2011-01-02T00:00', '2011-01-02T02:00', freq='H').tz_localize('Asia/Tokyo')) + s + + Previous Behavior: + + .. code-block:: ipython + + In [9]: s.map(lambda x: x.hour) + Out[9]: + 0 0 + 1 1 + 2 2 + dtype: int32 + + + New Behavior: + + .. ipython:: python + + s.map(lambda x: x.hour) + + + .. 
_whatsnew_0200.api: - ``CParserError`` has been renamed to ``ParserError`` in ``pd.read_csv`` and will be removed in the future (:issue:`12665`) - ``SparseArray.cumsum()`` and ``SparseSeries.cumsum()`` will now always return ``SparseArray`` and ``SparseSeries`` respectively (:issue:`12855`) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 922fb84684729..5124dc44e2fc8 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -930,8 +930,7 @@ def remove_unused_categories(self, inplace=False): return cat def map(self, mapper): - """ - Apply mapper function to its categories (not codes). + """Apply mapper function to its categories (not codes). Parameters ---------- @@ -943,7 +942,8 @@ def map(self, mapper): Returns ------- - applied : Categorical or np.ndarray. + applied : Categorical or Index. + """ new_categories = self.categories.map(mapper) try: diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index 512abfd88c78c..1cc546629589d 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -2427,8 +2427,7 @@ def groupby(self, values): return result def map(self, mapper): - """ - Apply mapper function to its values. + """Apply mapper function to an index. Parameters ---------- @@ -2437,9 +2436,21 @@ def map(self, mapper): Returns ------- - applied : array + applied : Union[Index, MultiIndex], inferred + The output of the mapping function applied to the index. + If the function returns a tuple with more than one element + a MultiIndex will be returned. 
+ """ - return self._arrmap(self.values, mapper) + from .multi import MultiIndex + mapped_values = self._arrmap(self.values, mapper) + attributes = self._get_attributes_dict() + if mapped_values.size and isinstance(mapped_values[0], tuple): + return MultiIndex.from_tuples(mapped_values, + names=attributes.get('name')) + + attributes['copy'] = False + return Index(mapped_values, **attributes) def isin(self, values, level=None): """ diff --git a/pandas/indexes/category.py b/pandas/indexes/category.py index c1f5d47e1e04f..2c89f72975ade 100644 --- a/pandas/indexes/category.py +++ b/pandas/indexes/category.py @@ -517,22 +517,22 @@ def take(self, indices, axis=0, allow_fill=True, return self._create_from_codes(taken) def map(self, mapper): - """ - Apply mapper function to its categories (not codes). + """Apply mapper function to its categories (not codes). Parameters ---------- mapper : callable Function to be applied. When all categories are mapped - to different categories, the result will be Categorical which has - the same order property as the original. Otherwise, the result will - be np.ndarray. + to different categories, the result will be a CategoricalIndex + which has the same order property as the original. Otherwise, + the result will be an Index. Returns ------- - applied : Categorical or np.ndarray. 
+ applied : CategoricalIndex or Index + """ - return self.values.map(mapper) + return self._shallow_copy_with_infer(self.values.map(mapper)) def delete(self, loc): """ diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 9be4935716989..3536a52432b8c 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -767,6 +767,48 @@ def test_sub(self): self.assertRaises(TypeError, lambda: idx - idx.tolist()) self.assertRaises(TypeError, lambda: idx.tolist() - idx) + def test_map_identity_mapping(self): + # GH 12766 + for name, cur_index in self.indices.items(): + tm.assert_index_equal(cur_index, cur_index.map(lambda x: x)) + + def test_map_with_tuples(self): + # GH 12766 + + # Test that returning a single tuple from an Index + # returns an Index. + boolean_index = tm.makeIntIndex(3).map(lambda x: (x,)) + expected = Index([(0,), (1,), (2,)]) + tm.assert_index_equal(boolean_index, expected) + + # Test that returning a tuple from a map of a single index + # returns a MultiIndex object. + boolean_index = tm.makeIntIndex(3).map(lambda x: (x, x == 1)) + expected = MultiIndex.from_tuples([(0, False), (1, True), (2, False)]) + tm.assert_index_equal(boolean_index, expected) + + # Test that returning a single object from a MultiIndex + # returns an Index. 
+ first_level = ['foo', 'bar', 'baz'] + multi_index = MultiIndex.from_tuples(lzip(first_level, [1, 2, 3])) + reduced_index = multi_index.map(lambda x: x[0]) + tm.assert_index_equal(reduced_index, Index(first_level)) + + def test_map_tseries_indices_return_index(self): + date_index = tm.makeDateIndex(10) + exp = Index([1] * 10) + tm.assert_index_equal(exp, date_index.map(lambda x: 1)) + + period_index = tm.makePeriodIndex(10) + tm.assert_index_equal(exp, period_index.map(lambda x: 1)) + + tdelta_index = tm.makeTimedeltaIndex(10) + tm.assert_index_equal(exp, tdelta_index.map(lambda x: 1)) + + date_index = tm.makeDateIndex(24, freq='h', name='hourly') + exp = Index(range(24), name='hourly') + tm.assert_index_equal(exp, date_index.map(lambda x: x.hour)) + def test_append_multiple(self): index = Index(['a', 'b', 'c', 'd', 'e', 'f']) @@ -1194,16 +1236,16 @@ def check_slice(in_slice, expected): self.assert_index_equal(result, expected) for in_slice, expected in [ - (SLC[::-1], 'yxdcb'), (SLC['b':'y':-1], ''), - (SLC['b'::-1], 'b'), (SLC[:'b':-1], 'yxdcb'), - (SLC[:'y':-1], 'y'), (SLC['y'::-1], 'yxdcb'), - (SLC['y'::-4], 'yb'), - # absent labels - (SLC[:'a':-1], 'yxdcb'), (SLC[:'a':-2], 'ydb'), - (SLC['z'::-1], 'yxdcb'), (SLC['z'::-3], 'yc'), - (SLC['m'::-1], 'dcb'), (SLC[:'m':-1], 'yx'), - (SLC['a':'a':-1], ''), (SLC['z':'z':-1], ''), - (SLC['m':'m':-1], '') + (SLC[::-1], 'yxdcb'), (SLC['b':'y':-1], ''), + (SLC['b'::-1], 'b'), (SLC[:'b':-1], 'yxdcb'), + (SLC[:'y':-1], 'y'), (SLC['y'::-1], 'yxdcb'), + (SLC['y'::-4], 'yb'), + # absent labels + (SLC[:'a':-1], 'yxdcb'), (SLC[:'a':-2], 'ydb'), + (SLC['z'::-1], 'yxdcb'), (SLC['z'::-3], 'yc'), + (SLC['m'::-1], 'dcb'), (SLC[:'m':-1], 'yx'), + (SLC['a':'a':-1], ''), (SLC['z':'z':-1], ''), + (SLC['m':'m':-1], '') ]: check_slice(in_slice, expected) diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 819b88bf4c5d3..708f424d9bad1 100644 --- a/pandas/tests/indexes/test_category.py +++ 
b/pandas/tests/indexes/test_category.py @@ -207,19 +207,20 @@ def test_map(self): ci = pd.CategoricalIndex(list('ABABC'), categories=list('CBA'), ordered=True) result = ci.map(lambda x: x.lower()) - exp = pd.Categorical(list('ababc'), categories=list('cba'), - ordered=True) - tm.assert_categorical_equal(result, exp) + exp = pd.CategoricalIndex(list('ababc'), categories=list('cba'), + ordered=True) + tm.assert_index_equal(result, exp) ci = pd.CategoricalIndex(list('ABABC'), categories=list('BAC'), ordered=False, name='XXX') result = ci.map(lambda x: x.lower()) - exp = pd.Categorical(list('ababc'), categories=list('bac'), - ordered=False) - tm.assert_categorical_equal(result, exp) + exp = pd.CategoricalIndex(list('ababc'), categories=list('bac'), + ordered=False, name='XXX') + tm.assert_index_equal(result, exp) - tm.assert_numpy_array_equal(ci.map(lambda x: 1), - np.array([1] * 5, dtype=np.int64)) + # GH 12766: Return an index not an array + tm.assert_index_equal(ci.map(lambda x: 1), + Index(np.array([1] * 5, dtype=np.int64), name='XXX')) # change categories dtype ci = pd.CategoricalIndex(list('ABABC'), categories=list('BAC'), @@ -228,9 +229,9 @@ def f(x): return {'A': 10, 'B': 20, 'C': 30}.get(x) result = ci.map(f) - exp = pd.Categorical([10, 20, 10, 20, 30], categories=[20, 10, 30], - ordered=False) - tm.assert_categorical_equal(result, exp) + exp = pd.CategoricalIndex([10, 20, 10, 20, 30], categories=[20, 10, 30], + ordered=False) + tm.assert_index_equal(result, exp) def test_where(self): i = self.create_index() diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py index 8d7676bef4d72..ec7ffde344d31 100644 --- a/pandas/tests/series/test_apply.py +++ b/pandas/tests/series/test_apply.py @@ -123,8 +123,9 @@ def test_apply_datetimetz(self): tm.assert_series_equal(result, exp) # change dtype + # GH 14506 : Returned dtype changed from int32 to int64 result = s.apply(lambda x: x.hour) - exp = pd.Series(list(range(24)) + [0], name='XX', 
dtype=np.int32) + exp = pd.Series(list(range(24)) + [0], name='XX', dtype=np.int64) tm.assert_series_equal(result, exp) # not vectorized @@ -317,8 +318,9 @@ def test_map_datetimetz(self): tm.assert_series_equal(result, exp) # change dtype + # GH 14506 : Returned dtype changed from int32 to int64 result = s.map(lambda x: x.hour) - exp = pd.Series(list(range(24)) + [0], name='XX', dtype=np.int32) + exp = pd.Series(list(range(24)) + [0], name='XX', dtype=np.int64) tm.assert_series_equal(result, exp) with tm.assertRaises(NotImplementedError): diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 5320b2216ee40..5d2c317cc0f81 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -1669,7 +1669,8 @@ def test_map(self): tm.assert_categorical_equal(result, exp) result = c.map(lambda x: 1) - tm.assert_numpy_array_equal(result, np.array([1] * 5, dtype=np.int64)) + # GH 12766: Return an index not an array + tm.assert_index_equal(result, Index(np.array([1] * 5, dtype=np.int64))) class TestCategoricalAsBlock(tm.TestCase): diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py index 4645ae24684ff..8e24f430108b3 100644 --- a/pandas/tseries/base.py +++ b/pandas/tseries/base.py @@ -27,7 +27,6 @@ from pandas.util.decorators import Appender, cache_readonly import pandas.types.concat as _concat import pandas.tseries.frequencies as frequencies -import pandas.algos as _algos class DatelikeOps(object): @@ -330,11 +329,16 @@ def _nat_new(self, box=True): def map(self, f): try: result = f(self) - if not isinstance(result, (np.ndarray, Index)): - raise TypeError + + # NOTE(review): _shallow_copy result is unused; ndarray falls back + if isinstance(result, np.ndarray): + self._shallow_copy(result) + + if not isinstance(result, Index): + raise TypeError('The map function must return an Index object') return result except Exception: - return _algos.arrmap_object(self.asobject.values, f) + return self.asobject.map(f) def sort_values(self, 
return_indexer=False, ascending=True): """ diff --git a/pandas/tseries/tests/test_converter.py b/pandas/tseries/tests/test_converter.py index 37d9c35639c32..7e4ed288e31c1 100644 --- a/pandas/tseries/tests/test_converter.py +++ b/pandas/tseries/tests/test_converter.py @@ -3,7 +3,7 @@ import nose import numpy as np -from pandas import Timestamp, Period +from pandas import Timestamp, Period, Index from pandas.compat import u import pandas.util.testing as tm from pandas.tseries.offsets import Second, Milli, Micro @@ -104,8 +104,8 @@ def test_dateindex_conversion(self): for freq in ('B', 'L', 'S'): dateindex = tm.makeDateIndex(k=10, freq=freq) rs = self.dtc.convert(dateindex, None, None) - xp = converter.dates.date2num(dateindex._mpl_repr()) - tm.assert_almost_equal(rs, xp, decimals) + xp = Index(converter.dates.date2num(dateindex._mpl_repr())) + tm.assert_index_equal(rs, xp, decimals) def test_resolution(self): def _assert_less(ts1, ts2): diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index fe0d28dd9c508..dae1554c0930e 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -3521,8 +3521,8 @@ def test_map(self): tm.assert_index_equal(result, expected) result = index.map(lambda x: x.ordinal) - exp = np.array([x.ordinal for x in index], dtype=np.int64) - tm.assert_numpy_array_equal(result, exp) + exp = Index([x.ordinal for x in index]) + tm.assert_index_equal(result, exp) def test_map_with_string_constructor(self): raw = [2005, 2007, 2009] @@ -3534,20 +3534,17 @@ def test_map_with_string_constructor(self): types += text_type, for t in types: - expected = np.array(lmap(t, raw), dtype=object) + expected = Index(lmap(t, raw)) res = index.map(t) - # should return an array - tm.assertIsInstance(res, np.ndarray) + # should return an Index + tm.assertIsInstance(res, Index) # preserve element types self.assertTrue(all(isinstance(resi, t) for resi in res)) - # dtype should be object - 
self.assertEqual(res.dtype, np.dtype('object').type) - # lastly, values should compare equal - tm.assert_numpy_array_equal(res, expected) + tm.assert_index_equal(res, expected) def test_convert_array_of_periods(self): rng = period_range('1/1/2000', periods=20, freq='D') diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py index f0d14014d6559..ca957ca0394d1 100644 --- a/pandas/tseries/tests/test_timedeltas.py +++ b/pandas/tseries/tests/test_timedeltas.py @@ -1513,8 +1513,8 @@ def test_map(self): f = lambda x: x.days result = rng.map(f) - exp = np.array([f(x) for x in rng], dtype=np.int64) - self.assert_numpy_array_equal(result, exp) + exp = Int64Index([f(x) for x in rng]) + tm.assert_index_equal(result, exp) def test_misc_coverage(self): diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index beacc21912edc..cd22ac561c6f7 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -3003,8 +3003,8 @@ def test_map(self): f = lambda x: x.strftime('%Y%m%d') result = rng.map(f) - exp = np.array([f(x) for x in rng], dtype='=U8') - tm.assert_almost_equal(result, exp) + exp = Index([f(x) for x in rng], dtype='