Skip to content

API: map() on Index returns an Index, not array #14506

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 11 commits into from
71 changes: 70 additions & 1 deletion doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -91,8 +91,77 @@ Other enhancements
Backwards incompatible API changes
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. _whatsnew_0200.api:

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this needs a ref tag as well
.. _whatsnew.api_breaking.index_map

.. _whatsnew.api_breaking.index_map:

Map on Index types now returns other Index types
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

- ``map`` on an ``Index`` now returns an ``Index``, not a numpy array (:issue:`12766`)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you can skip this example (the CategoricalIndex) and below

.. ipython:: python

idx = Index([1, 2])
idx
mi = MultiIndex.from_tuples([(1, 2), (2, 4)])
mi

Previous Behavior:

.. code-block:: ipython

In [5]: idx.map(lambda x: x * 2)
Out[5]: array([2, 4])

In [6]: idx.map(lambda x: (x, x * 2))
Out[6]: array([(1, 2), (2, 4)], dtype=object)

In [7]: mi.map(lambda x: x)
Out[7]: array([(1, 2), (2, 4)], dtype=object)

In [8]: mi.map(lambda x: x[0])
Out[8]: array([1, 2])

New Behavior:

.. ipython:: python

idx.map(lambda x: x * 2)

idx.map(lambda x: (x, x * 2))

mi.map(lambda x: x)

mi.map(lambda x: x[0])


- ``map`` on a Series with datetime64 values may return int64 dtypes rather than int32

.. ipython:: python

s = Series(date_range('2011-01-02T00:00', '2011-01-02T02:00', freq='H').tz_localize('Asia/Tokyo'))
s

Previous Behavior:

.. code-block:: ipython

In [9]: s.map(lambda x: x.hour)
Out[9]:
0 0
1 1
2 2
dtype: int32


New Behavior:

.. ipython:: python

s.map(lambda x: x.hour)


.. _whatsnew_0200.api:

- ``CParserError`` has been renamed to ``ParserError`` in ``pd.read_csv`` and will be removed in the future (:issue:`12665`)
- ``SparseArray.cumsum()`` and ``SparseSeries.cumsum()`` will now always return ``SparseArray`` and ``SparseSeries`` respectively (:issue:`12855`)
Expand Down
6 changes: 3 additions & 3 deletions pandas/core/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -930,8 +930,7 @@ def remove_unused_categories(self, inplace=False):
return cat

def map(self, mapper):
"""
Apply mapper function to its categories (not codes).
"""Apply mapper function to its categories (not codes).

Parameters
----------
Expand All @@ -943,7 +942,8 @@ def map(self, mapper):

Returns
-------
applied : Categorical or np.ndarray.
applied : Categorical or Index.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

actually this is ok

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍


"""
new_categories = self.categories.map(mapper)
try:
Expand Down
19 changes: 15 additions & 4 deletions pandas/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2427,8 +2427,7 @@ def groupby(self, values):
return result

def map(self, mapper):
"""
Apply mapper function to its values.
"""Apply mapper function to an index.

Parameters
----------
Expand All @@ -2437,9 +2436,21 @@ def map(self, mapper):

Returns
-------
applied : array
applied : Union[Index, MultiIndex], inferred
The output of the mapping function applied to the index.
If the function returns a tuple with more than one element
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

say the output Index type will be inferred

a MultiIndex will be returned.

"""
return self._arrmap(self.values, mapper)
from .multi import MultiIndex
mapped_values = self._arrmap(self.values, mapper)
attributes = self._get_attributes_dict()
if mapped_values.size and isinstance(mapped_values[0], tuple):
return MultiIndex.from_tuples(mapped_values,
names=attributes.get('name'))

attributes['copy'] = False
return Index(mapped_values, **attributes)

def isin(self, values, level=None):
"""
Expand Down
14 changes: 7 additions & 7 deletions pandas/indexes/category.py
Original file line number Diff line number Diff line change
Expand Up @@ -517,22 +517,22 @@ def take(self, indices, axis=0, allow_fill=True,
return self._create_from_codes(taken)

def map(self, mapper):
"""
Apply mapper function to its categories (not codes).
"""Apply mapper function to its categories (not codes).

Parameters
----------
mapper : callable
Function to be applied. When all categories are mapped
to different categories, the result will be Categorical which has
the same order property as the original. Otherwise, the result will
be np.ndarray.
to different categories, the result will be a CategoricalIndex
which has the same order property as the original. Otherwise,
the result will be an Index.

Returns
-------
applied : Categorical or np.ndarray.
applied : CategoricalIndex or Index

"""
return self.values.map(mapper)
return self._shallow_copy_with_infer(self.values.map(mapper))

def delete(self, loc):
"""
Expand Down
62 changes: 52 additions & 10 deletions pandas/tests/indexes/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -767,6 +767,48 @@ def test_sub(self):
self.assertRaises(TypeError, lambda: idx - idx.tolist())
self.assertRaises(TypeError, lambda: idx.tolist() - idx)

def test_map_identity_mapping(self):
# GH 12766
for name, cur_index in self.indices.items():
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add a comment with the appropriate github issue numbers (don't go crazy but where appropriate)

tm.assert_index_equal(cur_index, cur_index.map(lambda x: x))

def test_map_with_tuples(self):
# GH 12766

# Test that returning a single tuple from an Index
# returns an Index.
boolean_index = tm.makeIntIndex(3).map(lambda x: (x,))
expected = Index([(0,), (1,), (2,)])
tm.assert_index_equal(boolean_index, expected)

# Test that returning a tuple from a map of a single index
# returns a MultiIndex object.
boolean_index = tm.makeIntIndex(3).map(lambda x: (x, x == 1))
expected = MultiIndex.from_tuples([(0, False), (1, True), (2, False)])
tm.assert_index_equal(boolean_index, expected)

# Test that returning a single object from a MultiIndex
# returns an Index.
first_level = ['foo', 'bar', 'baz']
multi_index = MultiIndex.from_tuples(lzip(first_level, [1, 2, 3]))
reduced_index = multi_index.map(lambda x: x[0])
tm.assert_index_equal(reduced_index, Index(first_level))

def test_map_tseries_indices_return_index(self):
date_index = tm.makeDateIndex(10)
exp = Index([1] * 10)
tm.assert_index_equal(exp, date_index.map(lambda x: 1))

period_index = tm.makePeriodIndex(10)
tm.assert_index_equal(exp, period_index.map(lambda x: 1))

tdelta_index = tm.makeTimedeltaIndex(10)
tm.assert_index_equal(exp, tdelta_index.map(lambda x: 1))

date_index = tm.makeDateIndex(24, freq='h', name='hourly')
exp = Index(range(24), name='hourly')
tm.assert_index_equal(exp, date_index.map(lambda x: x.hour))

def test_append_multiple(self):
index = Index(['a', 'b', 'c', 'd', 'e', 'f'])

Expand Down Expand Up @@ -1194,16 +1236,16 @@ def check_slice(in_slice, expected):
self.assert_index_equal(result, expected)

for in_slice, expected in [
(SLC[::-1], 'yxdcb'), (SLC['b':'y':-1], ''),
(SLC['b'::-1], 'b'), (SLC[:'b':-1], 'yxdcb'),
(SLC[:'y':-1], 'y'), (SLC['y'::-1], 'yxdcb'),
(SLC['y'::-4], 'yb'),
# absent labels
(SLC[:'a':-1], 'yxdcb'), (SLC[:'a':-2], 'ydb'),
(SLC['z'::-1], 'yxdcb'), (SLC['z'::-3], 'yc'),
(SLC['m'::-1], 'dcb'), (SLC[:'m':-1], 'yx'),
(SLC['a':'a':-1], ''), (SLC['z':'z':-1], ''),
(SLC['m':'m':-1], '')
(SLC[::-1], 'yxdcb'), (SLC['b':'y':-1], ''),
(SLC['b'::-1], 'b'), (SLC[:'b':-1], 'yxdcb'),
(SLC[:'y':-1], 'y'), (SLC['y'::-1], 'yxdcb'),
(SLC['y'::-4], 'yb'),
# absent labels
(SLC[:'a':-1], 'yxdcb'), (SLC[:'a':-2], 'ydb'),
(SLC['z'::-1], 'yxdcb'), (SLC['z'::-3], 'yc'),
(SLC['m'::-1], 'dcb'), (SLC[:'m':-1], 'yx'),
(SLC['a':'a':-1], ''), (SLC['z':'z':-1], ''),
(SLC['m':'m':-1], '')
]:
check_slice(in_slice, expected)

Expand Down
23 changes: 12 additions & 11 deletions pandas/tests/indexes/test_category.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,19 +207,20 @@ def test_map(self):
ci = pd.CategoricalIndex(list('ABABC'), categories=list('CBA'),
ordered=True)
result = ci.map(lambda x: x.lower())
exp = pd.Categorical(list('ababc'), categories=list('cba'),
ordered=True)
tm.assert_categorical_equal(result, exp)
exp = pd.CategoricalIndex(list('ababc'), categories=list('cba'),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

show an example like this in the whatsnew as well (e.g. CategoricalIndex.map -> CategoricalIndex rather than Categorical now)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @jreback. Just wanted to touch base here since I added this example and then you asked me to remove it in another comment below. Did I add it in the wrong place or did you just decide it was a little overkill? Thanks.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it's too long for the what's new; so need to pare it down

Copy link
Contributor Author

@nateyoder nateyoder Dec 16, 2016

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

💯 sounds good. I shortened it up like you suggested. Let me know if you'd like any other changes.

ordered=True)
tm.assert_index_equal(result, exp)

ci = pd.CategoricalIndex(list('ABABC'), categories=list('BAC'),
ordered=False, name='XXX')
result = ci.map(lambda x: x.lower())
exp = pd.Categorical(list('ababc'), categories=list('bac'),
ordered=False)
tm.assert_categorical_equal(result, exp)
exp = pd.CategoricalIndex(list('ababc'), categories=list('bac'),
ordered=False, name='XXX')
tm.assert_index_equal(result, exp)

tm.assert_numpy_array_equal(ci.map(lambda x: 1),
np.array([1] * 5, dtype=np.int64))
# GH 12766: Return an index not an array
tm.assert_index_equal(ci.map(lambda x: 1),
Index(np.array([1] * 5, dtype=np.int64), name='XXX'))

# change categories dtype
ci = pd.CategoricalIndex(list('ABABC'), categories=list('BAC'),
Expand All @@ -228,9 +229,9 @@ def f(x):
return {'A': 10, 'B': 20, 'C': 30}.get(x)

result = ci.map(f)
exp = pd.Categorical([10, 20, 10, 20, 30], categories=[20, 10, 30],
ordered=False)
tm.assert_categorical_equal(result, exp)
exp = pd.CategoricalIndex([10, 20, 10, 20, 30], categories=[20, 10, 30],
ordered=False)
tm.assert_index_equal(result, exp)

def test_where(self):
i = self.create_index()
Expand Down
6 changes: 4 additions & 2 deletions pandas/tests/series/test_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,8 +123,9 @@ def test_apply_datetimetz(self):
tm.assert_series_equal(result, exp)

# change dtype
# GH 14506 : Returned dtype changed from int32 to int64
result = s.apply(lambda x: x.hour)
exp = pd.Series(list(range(24)) + [0], name='XX', dtype=np.int32)
exp = pd.Series(list(range(24)) + [0], name='XX', dtype=np.int64)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmm, I don't think this should have changed, these are normally int32s

any idea? (also this might not be the same on windows)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's true it previously gave int32, but is there any reason for that? We almost always use int64 as the default integer size, and also currently Timestamp.hour gives you back int64 when you replace the map with an explicit loop:

In [20]: dtidx = pd.date_range(start='2012-01-01', periods=4)

In [29]: dtidx.map(lambda x: x.hour)
Out[29]: array([0, 0, 0, 0], dtype=int32)

In [30]: np.array([x.hour for x in dtidx])
Out[30]: array([0, 0, 0, 0])

In [31]: np.array([x.hour for x in dtidx]).dtype
Out[31]: dtype('int64')

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's probably the result of using asobject below

In [37]: dtidx.map(lambda x: x.hour)
Out[37]: array([0, 0, 0, 0], dtype=int32)

In [36]: dtidx.asobject.map(lambda x: x.hour)
Out[36]: array([0, 0, 0, 0], dtype=int64)
``

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is actually an implementation detail (as they are stored as int32)

we can change but that should be separate PR

ideally just like to preserve here

tm.assert_series_equal(result, exp)

# not vectorized
Expand Down Expand Up @@ -317,8 +318,9 @@ def test_map_datetimetz(self):
tm.assert_series_equal(result, exp)

# change dtype
# GH 14506 : Returned dtype changed from int32 to int64
result = s.map(lambda x: x.hour)
exp = pd.Series(list(range(24)) + [0], name='XX', dtype=np.int32)
exp = pd.Series(list(range(24)) + [0], name='XX', dtype=np.int64)
tm.assert_series_equal(result, exp)

with tm.assertRaises(NotImplementedError):
Expand Down
3 changes: 2 additions & 1 deletion pandas/tests/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -1669,7 +1669,8 @@ def test_map(self):
tm.assert_categorical_equal(result, exp)

result = c.map(lambda x: 1)
tm.assert_numpy_array_equal(result, np.array([1] * 5, dtype=np.int64))
# GH 12766: Return an index not an array
tm.assert_index_equal(result, Index(np.array([1] * 5, dtype=np.int64)))


class TestCategoricalAsBlock(tm.TestCase):
Expand Down
12 changes: 8 additions & 4 deletions pandas/tseries/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@
from pandas.util.decorators import Appender, cache_readonly
import pandas.types.concat as _concat
import pandas.tseries.frequencies as frequencies
import pandas.algos as _algos


class DatelikeOps(object):
Expand Down Expand Up @@ -330,11 +329,16 @@ def _nat_new(self, box=True):
def map(self, f):
try:
result = f(self)
if not isinstance(result, (np.ndarray, Index)):
raise TypeError

# Try to use this result if we can
if isinstance(result, np.ndarray):
self._shallow_copy(result)

if not isinstance(result, Index):
raise TypeError('The map function must return an Index object')
return result
except Exception:
return _algos.arrmap_object(self.asobject.values, f)
return self.asobject.map(f)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is better, using .values can change dtypes (esp timezones)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this might actually close another issue...


def sort_values(self, return_indexer=False, ascending=True):
"""
Expand Down
6 changes: 3 additions & 3 deletions pandas/tseries/tests/test_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import nose

import numpy as np
from pandas import Timestamp, Period
from pandas import Timestamp, Period, Index
from pandas.compat import u
import pandas.util.testing as tm
from pandas.tseries.offsets import Second, Milli, Micro
Expand Down Expand Up @@ -104,8 +104,8 @@ def test_dateindex_conversion(self):
for freq in ('B', 'L', 'S'):
dateindex = tm.makeDateIndex(k=10, freq=freq)
rs = self.dtc.convert(dateindex, None, None)
xp = converter.dates.date2num(dateindex._mpl_repr())
tm.assert_almost_equal(rs, xp, decimals)
xp = Index(converter.dates.date2num(dateindex._mpl_repr()))
tm.assert_index_equal(rs, xp, decimals)

def test_resolution(self):
def _assert_less(ts1, ts2):
Expand Down
15 changes: 6 additions & 9 deletions pandas/tseries/tests/test_period.py
Original file line number Diff line number Diff line change
Expand Up @@ -3521,8 +3521,8 @@ def test_map(self):
tm.assert_index_equal(result, expected)

result = index.map(lambda x: x.ordinal)
exp = np.array([x.ordinal for x in index], dtype=np.int64)
tm.assert_numpy_array_equal(result, exp)
exp = Index([x.ordinal for x in index])
tm.assert_index_equal(result, exp)

def test_map_with_string_constructor(self):
raw = [2005, 2007, 2009]
Expand All @@ -3534,20 +3534,17 @@ def test_map_with_string_constructor(self):
types += text_type,

for t in types:
expected = np.array(lmap(t, raw), dtype=object)
expected = Index(lmap(t, raw))
res = index.map(t)

# should return an array
tm.assertIsInstance(res, np.ndarray)
# should return an Index
tm.assertIsInstance(res, Index)

# preserve element types
self.assertTrue(all(isinstance(resi, t) for resi in res))

# dtype should be object
self.assertEqual(res.dtype, np.dtype('object').type)

# lastly, values should compare equal
tm.assert_numpy_array_equal(res, expected)
tm.assert_index_equal(res, expected)

def test_convert_array_of_periods(self):
rng = period_range('1/1/2000', periods=20, freq='D')
Expand Down
Loading