diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 89da897f6c529..472663733e3b6 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -190,19 +190,19 @@ the target. Now, a ``ValueError`` will be raised when such an input is passed in .. _whatsnew_0210.api_breaking.iteration_scalars: -Iteration of Series/Index will now return python scalars +Iteration of Series/Index will now return Python scalars ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Previously, when using certain iteration methods for a ``Series`` with dtype ``int`` or ``float``, you would receive a ``numpy`` scalar, e.g. a ``np.int64``, rather than a python ``int``. Issue (:issue:`10904`) corrected this for ``Series.tolist()`` and ``list(Series)``. This change makes all iteration methods consistent, in particular, for ``__iter__()`` and ``.map()``; note that this only affect int/float dtypes. (:issue:`13236`, :issue:`13258`, :issue:`14216`). +Previously, when using certain iteration methods for a ``Series`` with dtype ``int`` or ``float``, you would receive a ``numpy`` scalar, e.g. a ``np.int64``, rather than a Python ``int``. Issue (:issue:`10904`) corrected this for ``Series.tolist()`` and ``list(Series)``. This change makes all iteration methods consistent, in particular, for ``__iter__()`` and ``.map()``; note that this only affects int/float dtypes. (:issue:`13236`, :issue:`13258`, :issue:`14216`). .. ipython:: python - s = Series([1, 2, 3]) + s = pd.Series([1, 2, 3]) s Previously: -.. code-block:: python +.. code-block:: ipython In [2]: type(list(s)[0]) Out[2]: numpy.int64 @@ -215,14 +215,14 @@ New Behaviour: Furthermore this will now correctly box the results of iteration for :func:`DataFrame.to_dict` as well. -.. ipython:: python +.. ipython:: python d = {'a':[1], 'b':['b']} - df = DataFrame(d) + df = pd.DataFrame(d) Previously: -.. code-block:: python +.. 
code-block:: ipython In [8]: type(df.to_dict()['a'][0]) Out[8]: numpy.int64 diff --git a/pandas/core/base.py b/pandas/core/base.py index 62d89eac4b354..f0e8d8a16661b 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -892,18 +892,31 @@ def argmin(self, axis=None): def tolist(self): """ - return a list of the values; box to scalars + Return a list of the values. + + These are each a scalar type, which is a Python scalar + (for str, int, float) or a pandas scalar + (for Timestamp/Timedelta/Interval/Period) + + See Also + -------- + numpy.ndarray.tolist """ - return list(self.__iter__()) + + if is_datetimelike(self): + return [_maybe_box_datetimelike(x) for x in self._values] + else: + return self._values.tolist() def __iter__(self): """ - provide iteration over the values; box to scalars + Return an iterator of the values. + + These are each a scalar type, which is a Python scalar + (for str, int, float) or a pandas scalar + (for Timestamp/Timedelta/Interval/Period) """ - if is_datetimelike(self): - return (_maybe_box_datetimelike(x) for x in self._values) - else: - return iter(self._values.tolist()) + return iter(self.tolist()) @cache_readonly def hasnans(self): diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index dbd2a79b7e46d..97df72900428c 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -26,7 +26,7 @@ is_integer_dtype, is_bool, is_list_like, is_sequence, is_scalar) -from pandas.core.common import is_null_slice +from pandas.core.common import is_null_slice, _maybe_box_datetimelike from pandas.core.algorithms import factorize, take_1d, unique1d from pandas.core.base import (PandasObject, PandasDelegate, @@ -401,8 +401,14 @@ def itemsize(self): def tolist(self): """ - return a list of my values + Return a list of the values. 
+ + These are each a scalar type, which is a Python scalar + (for str, int, float) or a pandas scalar + (for Timestamp/Timedelta/Interval/Period) """ + if is_datetimelike(self.categories): + return [_maybe_box_datetimelike(x) for x in self] return np.array(self).tolist() def reshape(self, new_shape, *args, **kwargs): diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index c8044b14e4e57..e952f4f4da1cb 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -253,9 +253,8 @@ def get_values(self): """ return the underlying data as an ndarray """ return self._data.get_values() - def __iter__(self): - """ iterate like Categorical """ - return self._data.__iter__() + def tolist(self): + return self._data.tolist() @property def codes(self): diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 05d31af57b36c..aac68ebd6abed 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -576,12 +576,13 @@ def test_isin(self): ci.isin(['c', 'a', 'b', np.nan]), np.array([True] * 6)) # mismatched categorical -> coerced to ndarray so doesn't matter - tm.assert_numpy_array_equal( - ci.isin(ci.set_categories(list('abcdefghi'))), np.array([True] * - 6)) - tm.assert_numpy_array_equal( - ci.isin(ci.set_categories(list('defghi'))), - np.array([False] * 5 + [True])) + result = ci.isin(ci.set_categories(list('abcdefghi'))) + expected = np.array([True] * 6) + tm.assert_numpy_array_equal(result, expected) + + result = ci.isin(ci.set_categories(list('defghi'))) + expected = np.array([False] * 5 + [True]) + tm.assert_numpy_array_equal(result, expected) def test_identical(self): diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index b7fbe803f8d3b..d0805e2bb54d2 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -245,43 +245,6 @@ def test_iter(self): for i, val in enumerate(self.ts): assert val 
== self.ts[i] - def test_iter_box(self): - vals = [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02')] - s = pd.Series(vals) - assert s.dtype == 'datetime64[ns]' - for res, exp in zip(s, vals): - assert isinstance(res, pd.Timestamp) - assert res.tz is None - assert res == exp - - vals = [pd.Timestamp('2011-01-01', tz='US/Eastern'), - pd.Timestamp('2011-01-02', tz='US/Eastern')] - s = pd.Series(vals) - - assert s.dtype == 'datetime64[ns, US/Eastern]' - for res, exp in zip(s, vals): - assert isinstance(res, pd.Timestamp) - assert res.tz == exp.tz - assert res == exp - - # timedelta - vals = [pd.Timedelta('1 days'), pd.Timedelta('2 days')] - s = pd.Series(vals) - assert s.dtype == 'timedelta64[ns]' - for res, exp in zip(s, vals): - assert isinstance(res, pd.Timedelta) - assert res == exp - - # period (object dtype, not boxed) - vals = [pd.Period('2011-01-01', freq='M'), - pd.Period('2011-01-02', freq='M')] - s = pd.Series(vals) - assert s.dtype == 'object' - for res, exp in zip(s, vals): - assert isinstance(res, pd.Period) - assert res.freq == 'M' - assert res == exp - def test_keys(self): # HACK: By doing this in two stages, we avoid 2to3 wrapping the call # to .keys() in a list() diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 210d0260b8d95..38d78b12b31aa 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -1054,10 +1054,7 @@ class TestToIterable(object): ('timedelta64[ns]', Timedelta)] @pytest.mark.parametrize( - 'dtype, rdtype', - dtypes + [ - ('object', object), - ('category', object)]) + 'dtype, rdtype', dtypes) @pytest.mark.parametrize( 'method', [ @@ -1074,6 +1071,43 @@ def test_iterable(self, typ, method, dtype, rdtype): result = method(s)[0] assert isinstance(result, rdtype) + @pytest.mark.parametrize( + 'dtype, rdtype, obj', + [ + ('object', object, 'a'), + ('object', (int, long), 1), + ('category', object, 'a'), + ('category', (int, long), 1)]) + @pytest.mark.parametrize( + 'method', + [ + lambda x: 
x.tolist(), + lambda x: list(x), + lambda x: list(x.__iter__()), + ], ids=['tolist', 'list', 'iter']) + @pytest.mark.parametrize('typ', [Series, Index]) + def test_iterable_object_and_category(self, typ, method, + dtype, rdtype, obj): + # gh-10904 + # gh-13258 + # coerce iteration to underlying python / pandas types + s = typ([obj], dtype=dtype) + result = method(s)[0] + assert isinstance(result, rdtype) + + @pytest.mark.parametrize( + 'dtype, rdtype', dtypes) + def test_iterable_items(self, dtype, rdtype): + # gh-13258 + # test items / iteritems yields the correct boxed scalars + # this only applies to series + s = Series([1], dtype=dtype) + _, result = list(s.items())[0] + assert isinstance(result, rdtype) + + _, result = list(s.iteritems())[0] + assert isinstance(result, rdtype) + @pytest.mark.parametrize( 'dtype, rdtype', dtypes + [ @@ -1102,3 +1136,40 @@ def test_categorial_datetimelike(self, method): result = method(i)[0] assert isinstance(result, Timestamp) + + def test_iter_box(self): + vals = [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02')] + s = pd.Series(vals) + assert s.dtype == 'datetime64[ns]' + for res, exp in zip(s, vals): + assert isinstance(res, pd.Timestamp) + assert res.tz is None + assert res == exp + + vals = [pd.Timestamp('2011-01-01', tz='US/Eastern'), + pd.Timestamp('2011-01-02', tz='US/Eastern')] + s = pd.Series(vals) + + assert s.dtype == 'datetime64[ns, US/Eastern]' + for res, exp in zip(s, vals): + assert isinstance(res, pd.Timestamp) + assert res.tz == exp.tz + assert res == exp + + # timedelta + vals = [pd.Timedelta('1 days'), pd.Timedelta('2 days')] + s = pd.Series(vals) + assert s.dtype == 'timedelta64[ns]' + for res, exp in zip(s, vals): + assert isinstance(res, pd.Timedelta) + assert res == exp + + # period (object dtype, not boxed) + vals = [pd.Period('2011-01-01', freq='M'), + pd.Period('2011-01-02', freq='M')] + s = pd.Series(vals) + assert s.dtype == 'object' + for res, exp in zip(s, vals): + assert isinstance(res, 
pd.Period) + assert res.freq == 'M' + assert res == exp