Merge pull request #4757 from jreback/reindex_er

jreback · jreback · commit e5b196db0d12 · 2013-09-06T16:09:08.000-07:00
BUG: Bug with reindexing on the index with a non-unique index will now raise a ValueError
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -151,6 +151,7 @@ pandas 0.13
   - ``Series.isin()`` and ``DataFrame.isin()``  now raise a ``TypeError`` when
     passed a string (:issue:`4763`). Pass a ``list`` of one element (containing
     the string) instead.
+  - Remove undocumented/unused ``kind`` keyword argument from ``read_excel``, and ``ExcelFile``. (:issue:`4713`, :issue:`4712`)
 
 **Internal Refactoring**
 
@@ -172,7 +173,7 @@ See :ref:`Internal Refactoring<whatsnew_0130.refactoring>`
     - ``_indexed_same,reindex_like,align,where,mask``
     - ``fillna,replace`` (``Series`` replace is now consistent with ``DataFrame``)
     - ``filter`` (also added axis argument to selectively filter on a different axis)
-    - ``reindex,reindex_axis`` (which was the biggest change to make generic)
+    - ``reindex,reindex_axis,take``
     - ``truncate`` (moved to become part of ``NDFrame``)
 
 - These are API changes which make ``Panel`` more consistent with ``DataFrame``
@@ -224,7 +225,6 @@ See :ref:`Internal Refactoring<whatsnew_0130.refactoring>`
 - Refactor of ``_get_numeric_data/_get_bool_data`` to core/generic.py, allowing Series/Panel functionaility
 - Refactor of Series arithmetic with time-like objects (datetime/timedelta/time
   etc.) into a separate, cleaned up wrapper class. (:issue:`4613`)
-- Remove undocumented/unused ``kind`` keyword argument from ``read_excel``, and ``ExcelFile``. (:issue:`4713`, :issue:`4712`)
 
 **Experimental Features**
 
@@ -326,6 +326,7 @@ See :ref:`Internal Refactoring<whatsnew_0130.refactoring>`
   - Bug with using ``QUOTE_NONE`` with ``to_csv`` causing ``Exception``. (:issue:`4328`)
   - Bug with Series indexing not raising an error when the right-hand-side has an incorrect length (:issue:`2702`)
   - Bug in multi-indexing with a partial string selection as one part of a MultIndex (:issue:`4758`)
+  - Bug with reindexing on the index with a non-unique index will now raise ``ValueError`` (:issue:`4746`)
 
 pandas 0.12
 ===========
diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt
@@ -297,7 +297,7 @@ and behaviors. Series formerly subclassed directly from ``ndarray``. (:issue:`40
     - ``_indexed_same,reindex_like,align,where,mask``
     - ``fillna,replace`` (``Series`` replace is now consistent with ``DataFrame``)
     - ``filter`` (also added axis argument to selectively filter on a different axis)
-    - ``reindex,reindex_axis`` (which was the biggest change to make generic)
+    - ``reindex,reindex_axis,take``
     - ``truncate`` (moved to become part of ``NDFrame``)
 
 - These are API changes which make ``Panel`` more consistent with ``DataFrame``
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -2267,15 +2267,15 @@ def _reindex_index(self, new_index, method, copy, level, fill_value=NA,
                                                 limit=limit, copy_if_needed=True,
                                                 takeable=takeable)
         return self._reindex_with_indexers({0: [new_index, indexer]},
-                                           copy=copy, fill_value=fill_value)
+                                           copy=copy, fill_value=fill_value, allow_dups=takeable)
 
     def _reindex_columns(self, new_columns, copy, level, fill_value=NA,
                          limit=None, takeable=False):
         new_columns, indexer = self.columns.reindex(new_columns, level=level,
                                                     limit=limit, copy_if_needed=True,
                                                     takeable=takeable)
         return self._reindex_with_indexers({1: [new_columns, indexer]},
-                                           copy=copy, fill_value=fill_value)
+                                           copy=copy, fill_value=fill_value, allow_dups=takeable)
 
     def _reindex_multi(self, axes, copy, fill_value):
         """ we are guaranteed non-Nones in the axes! """
@@ -2513,49 +2513,6 @@ def _maybe_cast(values, labels=None):
 
     delevel = deprecate('delevel', reset_index)
 
-    def take(self, indices, axis=0, convert=True):
-        """
-        Analogous to ndarray.take, return DataFrame corresponding to requested
-        indices along an axis
-
-        Parameters
-        ----------
-        indices : list / array of ints
-        axis : {0, 1}
-        convert : convert indices for negative values, check bounds, default True
-                  mainly useful for an user routine calling
-
-        Returns
-        -------
-        taken : DataFrame
-        """
-
-        # check/convert indicies here
-        if convert:
-            axis = self._get_axis_number(axis)
-            indices = _maybe_convert_indices(
-                indices, len(self._get_axis(axis)))
-
-        if self._is_mixed_type:
-            if axis == 0:
-                new_data = self._data.take(indices, axis=1, verify=False)
-                return DataFrame(new_data)
-            else:
-                new_columns = self.columns.take(indices)
-                return self.reindex(columns=new_columns)
-        else:
-            new_values = com.take_nd(self.values,
-                                     com._ensure_int64(indices),
-                                     axis=axis)
-            if axis == 0:
-                new_columns = self.columns
-                new_index = self.index.take(indices)
-            else:
-                new_columns = self.columns.take(indices)
-                new_index = self.index
-            return self._constructor(new_values, index=new_index,
-                                     columns=new_columns)
-
     #----------------------------------------------------------------------
     # Reindex-based selection methods
 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -862,12 +862,13 @@ def take(self, indices, axis=0, convert=True):
             indices = _maybe_convert_indices(
                 indices, len(self._get_axis(axis)))
 
-        if axis == 0:
+        baxis = self._get_block_manager_axis(axis)
+        if baxis == 0:
             labels = self._get_axis(axis)
             new_items = labels.take(indices)
-            new_data = self._data.reindex_axis(new_items, axis=0)
+            new_data = self._data.reindex_axis(new_items, indexer=indices, axis=0)
         else:
-            new_data = self._data.take(indices, axis=axis, verify=False)
+            new_data = self._data.take(indices, axis=baxis)
         return self._constructor(new_data)
 
     def select(self, crit, axis=0):
@@ -944,7 +945,7 @@ def drop(self, labels, axis=0, level=None):
                 new_axis = axis.drop(labels, level=level)
             else:
                 new_axis = axis.drop(labels)
-            dropped = self.reindex(**{axis_name: new_axis})
+            dropped = self.reindex(**{ axis_name: new_axis })
             try:
                 dropped.axes[axis_].set_names(axis.names, inplace=True)
             except AttributeError:
@@ -1161,7 +1162,8 @@ def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True,
         return self._reindex_with_indexers({axis: [new_index, indexer]}, method=method, fill_value=fill_value,
                                            limit=limit, copy=copy)._propogate_attributes(self)
 
-    def _reindex_with_indexers(self, reindexers, method=None, fill_value=np.nan, limit=None, copy=False):
+    def _reindex_with_indexers(self, reindexers, method=None, fill_value=np.nan, limit=None, copy=False, allow_dups=False):
+        """ allow_dups indicates an internal call here """
 
         # reindex doing multiple operations on different axes if indiciated
         new_data = self._data
@@ -1183,7 +1185,7 @@ def _reindex_with_indexers(self, reindexers, method=None, fill_value=np.nan, lim
                 # TODO: speed up on homogeneous DataFrame objects
                 indexer = com._ensure_int64(indexer)
                 new_data = new_data.reindex_indexer(index, indexer, axis=baxis,
-                                                    fill_value=fill_value)
+                                                    fill_value=fill_value, allow_dups=allow_dups)
 
             elif baxis == 0 and index is not None and index is not new_data.axes[baxis]:
                 new_data = new_data.reindex_items(index, copy=copy,
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
@@ -668,7 +668,7 @@ def _reindex(keys, level=None):
                     if axis+1 > ndim:
                         raise AssertionError("invalid indexing error with non-unique index")
 
-                    result = result._reindex_with_indexers({ axis : [ new_labels, new_indexer ] }, copy=True)
+                    result = result._reindex_with_indexers({ axis : [ new_labels, new_indexer ] }, copy=True, allow_dups=True)
 
                 return result
 
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
@@ -198,6 +198,7 @@ def reindex_axis(self, indexer, method=None, axis=1, fill_value=None, limit=None
             raise AssertionError('axis must be at least 1, got %d' % axis)
         if fill_value is None:
             fill_value = self.fill_value
+
         new_values = com.take_nd(self.values, indexer, axis,
                                  fill_value=fill_value, mask_info=mask_info)
         return make_block(
@@ -1515,15 +1516,28 @@ def reindex_items_from(self, new_ref_items, indexer=None, method=None, fill_valu
         if indexer is None:
             indexer = np.arange(len(self.items))
 
-        new_values = com.take_1d(self.values.values, indexer)
+        # single block
+        if self.ndim == 1:
+
+            new_items = new_ref_items
+            new_values = com.take_1d(self.values.values, indexer)
+
+        else:
+
+            # if we don't overlap at all, then don't include this block
+            new_items = self.items & new_ref_items
+            if not len(new_items):
+                return None
+
+            new_values = self.values.values
 
         # fill if needed
         if method is not None or limit is not None:
             if fill_value is None:
                 fill_value = self.fill_value
             new_values = com.interpolate_2d(new_values, method=method, limit=limit, fill_value=fill_value)
 
-        return self.make_block(new_values, items=new_ref_items, ref_items=new_ref_items, copy=copy)
+        return self.make_block(new_values, items=new_items, ref_items=new_ref_items, copy=copy)
 
     def sparse_reindex(self, new_index):
         """ sparse reindex and return a new block
@@ -2718,10 +2732,14 @@ def reindex_axis0_with_method(self, new_axis, indexer=None, method=None, fill_va
         raise AssertionError('method argument not supported for '
                              'axis == 0')
 
-    def reindex_indexer(self, new_axis, indexer, axis=1, fill_value=None):
+    def reindex_indexer(self, new_axis, indexer, axis=1, fill_value=None, allow_dups=False):
         """
         pandas-indexer with -1's only.
         """
+        # trying to reindex on an axis with duplicates
+        if not allow_dups and not self.axes[axis].is_unique:
+            raise ValueError("cannot reindex from a duplicate axis")
+
         if axis == 0:
             return self._reindex_indexer_items(new_axis, indexer, fill_value)
 
@@ -2789,15 +2807,34 @@ def reindex_items(self, new_items, indexer=None, copy=True, fill_value=None):
         if indexer is None:
             for blk in self.blocks:
                 if copy:
-                    new_blocks.append(blk.reindex_items_from(new_items))
+                    blk = blk.reindex_items_from(new_items)
                 else:
                     blk.ref_items = new_items
+                if blk is not None:
                     new_blocks.append(blk)
         else:
-            for block in self.blocks:
-                newb = block.reindex_items_from(new_items, copy=copy)
-                if len(newb.items) > 0:
-                    new_blocks.append(newb)
+
+            # unique
+            if self.axes[0].is_unique:
+                for block in self.blocks:
+
+                    newb = block.reindex_items_from(new_items, copy=copy)
+                    if newb is not None and len(newb.items) > 0:
+                        new_blocks.append(newb)
+
+            # non-unique
+            else:
+                rl = self._set_ref_locs()
+                for i, idx in enumerate(indexer):
+                    blk, lidx = rl[idx]
+                    item = new_items.take([i])
+                    blk = make_block(_block_shape(blk.iget(lidx)),
+                                     item,
+                                     new_items,
+                                     ndim=self.ndim,
+                                     fastpath=True,
+                                     placement = [i])
+                    new_blocks.append(blk)
 
             # add a na block if we are missing items
             mask = indexer == -1
diff --git a/pandas/sparse/tests/test_sparse.py b/pandas/sparse/tests/test_sparse.py
@@ -384,7 +384,9 @@ def test_getitem_slice(self):
         idx = self.bseries.index
         res = self.bseries[::2]
         tm.assert_isinstance(res, SparseSeries)
-        assert_sp_series_equal(res, self.bseries.reindex(idx[::2]))
+
+        expected = self.bseries.reindex(idx[::2])
+        assert_sp_series_equal(res, expected)
 
         res = self.bseries[:5]
         tm.assert_isinstance(res, SparseSeries)
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -2879,7 +2879,7 @@ def test_constructor_column_duplicates(self):
                           columns=['b', 'a', 'a'])
 
 
-    def test_column_duplicates_operations(self):
+    def test_column_dups_operations(self):
 
         def check(result, expected=None):
             if expected is not None:
@@ -2973,22 +2973,6 @@ def check(result, expected=None):
         expected = DataFrame([[1,5,7.],[1,5,7.],[1,5,7.]],columns=['bar','hello','foo2'])
         check(df,expected)
 
-        # reindex
-        df = DataFrame([[1,5,7.],[1,5,7.],[1,5,7.]],columns=['bar','a','a'])
-        expected = DataFrame([[1],[1],[1]],columns=['bar'])
-        result = df.reindex(columns=['bar'])
-        check(result,expected)
-
-        result1 = DataFrame([[1],[1],[1]],columns=['bar']).reindex(columns=['bar','foo'])
-        result2 = df.reindex(columns=['bar','foo'])
-        check(result2,result1)
-
-        # drop
-        df = DataFrame([[1,5,7.],[1,5,7.],[1,5,7.]],columns=['bar','a','a'])
-        df = df.drop(['a'],axis=1)
-        expected = DataFrame([[1],[1],[1]],columns=['bar'])
-        check(df,expected)
-
         # values
         df = DataFrame([[1,2.5],[3,4.5]], index=[1,2], columns=['x','x'])
         result = df.values
@@ -3016,6 +3000,17 @@ def check(result, expected=None):
                              columns=['RT','TClose','TExg','RPT_Date','STK_ID','STK_Name','QT_Close']).set_index(['STK_ID','RPT_Date'],drop=False)
         assert_frame_equal(result,expected)
 
+        # reindex is invalid!
+        df = DataFrame([[1,5,7.],[1,5,7.],[1,5,7.]],columns=['bar','a','a'])
+        self.assertRaises(ValueError, df.reindex, columns=['bar'])
+        self.assertRaises(ValueError, df.reindex, columns=['bar','foo'])
+
+        # drop
+        df = DataFrame([[1,5,7.],[1,5,7.],[1,5,7.]],columns=['bar','a','a'])
+        df = df.drop(['a'],axis=1)
+        expected = DataFrame([[1],[1],[1]],columns=['bar'])
+        check(df,expected)
+
     def test_insert_benchmark(self):
         # from the vb_suite/frame_methods/frame_insert_columns
         N = 10
@@ -7573,6 +7568,21 @@ def test_reindex_fill_value(self):
         expected = df.reindex(lrange(15)).fillna(0)
         assert_frame_equal(result, expected)
 
+    def test_reindex_dups(self):
+
+        # GH4746, reindex on duplicate index error messages
+        arr = np.random.randn(10)
+        df = DataFrame(arr,index=[1,2,3,4,5,1,2,3,4,5])
+
+        # set index is ok
+        result = df.copy()
+        result.index = list(range(len(df)))
+        expected = DataFrame(arr,index=list(range(len(df))))
+        assert_frame_equal(result,expected)
+
+        # reindex fails
+        self.assertRaises(ValueError, df.reindex, index=list(range(len(df))))
+
     def test_align(self):
 
         af, bf = self.frame.align(self.frame)