API/ERR: allow iterators in df.set_index & improve errors (pandas-dev#24984)

h-vetinari · Pingviinituutti · commit a97556691bf9 · 2019-02-28T10:26:56.000+02:00
diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
@@ -22,6 +22,7 @@ Other Enhancements
 - Indexing of ``DataFrame`` and ``Series`` now accepts zerodim ``np.ndarray`` (:issue:`24919`)
 - :meth:`Timestamp.replace` now supports the ``fold`` argument to disambiguate DST transition times (:issue:`25017`)
 - :meth:`DataFrame.at_time` and :meth:`Series.at_time` now support :meth:`datetime.time` objects with timezones (:issue:`24043`)
+- :meth:`DataFrame.set_index` now works for instances of ``collections.abc.Iterator``, provided their output is of the same length as the calling frame (:issue:`22484`, :issue:`24984`)
 - :meth:`DatetimeIndex.union` now supports the ``sort`` argument. The behaviour of the sort parameter matches that of :meth:`Index.union` (:issue:`24994`)
 -
 
diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py
@@ -137,6 +137,7 @@ def lfilter(*args, **kwargs):
     reload = reload
     Hashable = collections.abc.Hashable
     Iterable = collections.abc.Iterable
+    Iterator = collections.abc.Iterator
     Mapping = collections.abc.Mapping
     MutableMapping = collections.abc.MutableMapping
     Sequence = collections.abc.Sequence
@@ -199,6 +200,7 @@ def get_range_parameters(data):
 
     Hashable = collections.Hashable
     Iterable = collections.Iterable
+    Iterator = collections.Iterator
     Mapping = collections.Mapping
     MutableMapping = collections.MutableMapping
     Sequence = collections.Sequence
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -33,7 +33,7 @@
 
 from pandas import compat
 from pandas.compat import (range, map, zip, lmap, lzip, StringIO, u,
-                           PY36, raise_with_traceback,
+                           PY36, raise_with_traceback, Iterator,
                            string_and_binary_types)
 from pandas.compat.numpy import function as nv
 from pandas.core.dtypes.cast import (
@@ -4025,7 +4025,8 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
             This parameter can be either a single column key, a single array of
             the same length as the calling DataFrame, or a list containing an
             arbitrary combination of column keys and arrays. Here, "array"
-            encompasses :class:`Series`, :class:`Index` and ``np.ndarray``.
+            encompasses :class:`Series`, :class:`Index`, ``np.ndarray``, and
+            instances of :class:`collections.abc.Iterator`.
         drop : bool, default True
             Delete columns to be used as the new index.
         append : bool, default False
@@ -4104,6 +4105,32 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
         if not isinstance(keys, list):
             keys = [keys]
 
+        err_msg = ('The parameter "keys" may be a column key, one-dimensional '
+                   'array, or a list containing only valid column keys and '
+                   'one-dimensional arrays.')
+
+        missing = []
+        for col in keys:
+            if isinstance(col, (ABCIndexClass, ABCSeries, np.ndarray,
+                                list, Iterator)):
+                # arrays are fine as long as they are one-dimensional
+                # iterators get converted to list below
+                if getattr(col, 'ndim', 1) != 1:
+                    raise ValueError(err_msg)
+            else:
+                # everything else gets tried as a key; see GH 24969
+                try:
+                    found = col in self.columns
+                except TypeError:
+                    raise TypeError(err_msg + ' Received column of '
+                                    'type {}'.format(type(col)))
+                else:
+                    if not found:
+                        missing.append(col)
+
+        if missing:
+            raise KeyError('None of {} are in the columns'.format(missing))
+
         if inplace:
             frame = self
         else:
@@ -4132,13 +4159,25 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
             elif isinstance(col, (list, np.ndarray)):
                 arrays.append(col)
                 names.append(None)
+            elif isinstance(col, Iterator):
+                arrays.append(list(col))
+                names.append(None)
             # from here, col can only be a column label
             else:
                 arrays.append(frame[col]._values)
                 names.append(col)
                 if drop:
                     to_remove.append(col)
 
+            if len(arrays[-1]) != len(self):
+                # check newest element against length of calling frame, since
+                # ensure_index_from_sequences would not raise for append=False.
+                raise ValueError('Length mismatch: Expected {len_self} rows, '
+                                 'received array of length {len_col}'.format(
+                                     len_self=len(self),
+                                     len_col=len(arrays[-1])
+                                 ))
+
         index = ensure_index_from_sequences(arrays, names)
 
         if verify_integrity and not index.is_unique:
diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py
@@ -178,10 +178,10 @@ def test_set_index_pass_arrays(self, frame_of_index_cols,
     # MultiIndex constructor does not work directly on Series -> lambda
     # We also emulate a "constructor" for the label -> lambda
     # also test index name if append=True (name is duplicate here for A)
-    @pytest.mark.parametrize('box2', [Series, Index, np.array, list,
+    @pytest.mark.parametrize('box2', [Series, Index, np.array, list, iter,
                                       lambda x: MultiIndex.from_arrays([x]),
                                       lambda x: x.name])
-    @pytest.mark.parametrize('box1', [Series, Index, np.array, list,
+    @pytest.mark.parametrize('box1', [Series, Index, np.array, list, iter,
                                       lambda x: MultiIndex.from_arrays([x]),
                                       lambda x: x.name])
     @pytest.mark.parametrize('append, index_name', [(True, None),
@@ -195,6 +195,9 @@ def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop,
         keys = [box1(df['A']), box2(df['A'])]
         result = df.set_index(keys, drop=drop, append=append)
 
+        # if either box is iter, it has been consumed; re-read
+        keys = [box1(df['A']), box2(df['A'])]
+
         # need to adapt first drop for case that both keys are 'A' --
         # cannot drop the same column twice;
         # use "is" because == would give ambiguous Boolean error for containers
@@ -253,25 +256,48 @@ def test_set_index_raise_keys(self, frame_of_index_cols, drop, append):
             df.set_index(['A', df['A'], tuple(df['A'])],
                          drop=drop, append=append)
 
-    @pytest.mark.xfail(reason='broken due to revert, see GH 25085')
     @pytest.mark.parametrize('append', [True, False])
     @pytest.mark.parametrize('drop', [True, False])
-    @pytest.mark.parametrize('box', [set, iter, lambda x: (y for y in x)],
-                             ids=['set', 'iter', 'generator'])
+    @pytest.mark.parametrize('box', [set], ids=['set'])
     def test_set_index_raise_on_type(self, frame_of_index_cols, box,
                                      drop, append):
         df = frame_of_index_cols
 
         msg = 'The parameter "keys" may be a column key, .*'
-        # forbidden type, e.g. set/iter/generator
+        # forbidden type, e.g. set
         with pytest.raises(TypeError, match=msg):
             df.set_index(box(df['A']), drop=drop, append=append)
 
-        # forbidden type in list, e.g. set/iter/generator
+        # forbidden type in list, e.g. set
         with pytest.raises(TypeError, match=msg):
             df.set_index(['A', df['A'], box(df['A'])],
                          drop=drop, append=append)
 
+    # MultiIndex constructor does not work directly on Series -> lambda
+    @pytest.mark.parametrize('box', [Series, Index, np.array, iter,
+                                     lambda x: MultiIndex.from_arrays([x])],
+                             ids=['Series', 'Index', 'np.array',
+                                  'iter', 'MultiIndex'])
+    @pytest.mark.parametrize('length', [4, 6], ids=['too_short', 'too_long'])
+    @pytest.mark.parametrize('append', [True, False])
+    @pytest.mark.parametrize('drop', [True, False])
+    def test_set_index_raise_on_len(self, frame_of_index_cols, box, length,
+                                    drop, append):
+        # GH 24984
+        df = frame_of_index_cols  # has length 5
+
+        values = np.random.randint(0, 10, (length,))
+
+        msg = 'Length mismatch: Expected 5 rows, received array of length.*'
+
+        # wrong length directly
+        with pytest.raises(ValueError, match=msg):
+            df.set_index(box(values), drop=drop, append=append)
+
+        # wrong length in list
+        with pytest.raises(ValueError, match=msg):
+            df.set_index(['A', df.A, box(values)], drop=drop, append=append)
+
     def test_set_index_custom_label_type(self):
         # GH 24969
 
@@ -341,7 +367,7 @@ def __repr__(self):
 
         # missing key
         thing3 = Thing(['Three', 'pink'])
-        msg = '.*'  # due to revert, see GH 25085
+        msg = r"frozenset\(\{'Three', 'pink'\}\)"
         with pytest.raises(KeyError, match=msg):
             # missing label directly
             df.set_index(thing3)
@@ -366,7 +392,7 @@ def __str__(self):
         thing2 = Thing('Two', 'blue')
         df = DataFrame([[0, 2], [1, 3]], columns=[thing1, thing2])
 
-        msg = 'unhashable type.*'
+        msg = 'The parameter "keys" may be a column key, .*'
 
         with pytest.raises(TypeError, match=msg):
             # use custom label directly

Original file line number	Diff line number	Diff line change
`@@ -22,6 +22,7 @@ Other Enhancements`
`22`	`22`	- Indexing of ``DataFrame`` and ``Series`` now accepts zerodim ``np.ndarray`` (:issue:`24919`)
`23`	`23`	- :meth:`Timestamp.replace` now supports the ``fold`` argument to disambiguate DST transition times (:issue:`25017`)
`24`	`24`	- :meth:`DataFrame.at_time` and :meth:`Series.at_time` now support :meth:`datetime.time` objects with timezones (:issue:`24043`)
	`25`	+- :meth:`DataFrame.set_index` now works for instances of ``collections.abc.Iterator``, provided their output is of the same length as the calling frame (:issue:`22484`, :issue:`24984`)
`25`	`26`	- :meth:`DatetimeIndex.union` now supports the ``sort`` argument. The behaviour of the sort parameter matches that of :meth:`Index.union` (:issue:`24994`)
`26`	`27`	`-`
`27`	`28`