From 3f76205f1d4a42ba971ff5be8770b52172b9f822 Mon Sep 17 00:00:00 2001 From: jreback Date: Thu, 5 Sep 2013 15:21:51 -0400 Subject: [PATCH 1/2] BUG: Bug with reindexing where a non-unique index will now raise ValueError (GH4746) --- doc/source/release.rst | 1 + pandas/core/frame.py | 7 +++--- pandas/core/generic.py | 7 +++--- pandas/core/indexing.py | 2 +- pandas/core/internals.py | 7 +++++- pandas/tests/test_frame.py | 44 +++++++++++++++++++++++--------------- 6 files changed, 42 insertions(+), 26 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index d75e9d2cee52e..eae38a75483e4 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -326,6 +326,7 @@ See :ref:`Internal Refactoring` - Bug with using ``QUOTE_NONE`` with ``to_csv`` causing ``Exception``. (:issue:`4328`) - Bug with Series indexing not raising an error when the right-hand-side has an incorrect length (:issue:`2702`) - Bug in multi-indexing with a partial string selection as one part of a MultIndex (:issue:`4758`) + - Bug with reindexing on the index with a non-unique index will now raise ``ValueError`` (:issue:`4746`) pandas 0.12 =========== diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8c6e7697f8ea1..7f79b4b6b3d6a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2267,7 +2267,7 @@ def _reindex_index(self, new_index, method, copy, level, fill_value=NA, limit=limit, copy_if_needed=True, takeable=takeable) return self._reindex_with_indexers({0: [new_index, indexer]}, - copy=copy, fill_value=fill_value) + copy=copy, fill_value=fill_value, allow_dups=takeable) def _reindex_columns(self, new_columns, copy, level, fill_value=NA, limit=None, takeable=False): @@ -2275,7 +2275,7 @@ def _reindex_columns(self, new_columns, copy, level, fill_value=NA, limit=limit, copy_if_needed=True, takeable=takeable) return self._reindex_with_indexers({1: [new_columns, indexer]}, - copy=copy, fill_value=fill_value) + copy=copy, fill_value=fill_value, allow_dups=takeable) def _reindex_multi(self, axes, copy, fill_value): """ we are guaranteed non-Nones in the axes! """ @@ -2541,8 +2541,7 @@ def take(self, indices, axis=0, convert=True): new_data = self._data.take(indices, axis=1, verify=False) return DataFrame(new_data) else: - new_columns = self.columns.take(indices) - return self.reindex(columns=new_columns) + return self.reindex(columns=indices, takeable=True) else: new_values = com.take_nd(self.values, com._ensure_int64(indices), diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 796c3948a2681..3bebb2bc3660f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -944,7 +944,7 @@ def drop(self, labels, axis=0, level=None): new_axis = axis.drop(labels, level=level) else: new_axis = axis.drop(labels) - dropped = self.reindex(**{axis_name: new_axis}) + dropped = self.reindex(**{ axis_name: new_axis }) try: dropped.axes[axis_].set_names(axis.names, inplace=True) except AttributeError: @@ -1161,7 +1161,8 @@ def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True, return self._reindex_with_indexers({axis: [new_index, indexer]}, method=method, fill_value=fill_value, limit=limit, copy=copy)._propogate_attributes(self) - def _reindex_with_indexers(self, reindexers, method=None, fill_value=np.nan, limit=None, copy=False): + def _reindex_with_indexers(self, reindexers, method=None, fill_value=np.nan, limit=None, copy=False, allow_dups=False): + """ allow_dups indicates an internal call here """ # reindex doing multiple operations on different axes if indiciated new_data = self._data @@ -1183,7 +1184,7 @@ def _reindex_with_indexers(self, reindexers, method=None, fill_value=np.nan, lim # TODO: speed up on homogeneous DataFrame objects indexer = com._ensure_int64(indexer) new_data = new_data.reindex_indexer(index, indexer, axis=baxis, - fill_value=fill_value) + fill_value=fill_value, allow_dups=allow_dups) elif baxis == 0 and index is not None and index is not new_data.axes[baxis]: new_data = new_data.reindex_items(index, copy=copy, diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 9ecdf1930604f..19eeecfeb2bde 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -668,7 +668,7 @@ def _reindex(keys, level=None): if axis+1 > ndim: raise AssertionError("invalid indexing error with non-unique index") - result = result._reindex_with_indexers({ axis : [ new_labels, new_indexer ] }, copy=True) + result = result._reindex_with_indexers({ axis : [ new_labels, new_indexer ] }, copy=True, allow_dups=True) return result diff --git a/pandas/core/internals.py b/pandas/core/internals.py index e0ee99455d238..6a5757dd7488d 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -198,6 +198,7 @@ def reindex_axis(self, indexer, method=None, axis=1, fill_value=None, limit=None raise AssertionError('axis must be at least 1, got %d' % axis) if fill_value is None: fill_value = self.fill_value + new_values = com.take_nd(self.values, indexer, axis, fill_value=fill_value, mask_info=mask_info) return make_block( @@ -2718,10 +2719,14 @@ def reindex_axis0_with_method(self, new_axis, indexer=None, method=None, fill_va raise AssertionError('method argument not supported for ' 'axis == 0') - def reindex_indexer(self, new_axis, indexer, axis=1, fill_value=None): + def reindex_indexer(self, new_axis, indexer, axis=1, fill_value=None, allow_dups=False): """ pandas-indexer with -1's only. """ + # trying to reindex on an axis with duplicates + if not allow_dups and not self.axes[axis].is_unique: + raise ValueError("cannot reindex from a duplicate axis") + if axis == 0: return self._reindex_indexer_items(new_axis, indexer, fill_value) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index b4ec36ac5f29e..bb76547da0c28 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -2879,7 +2879,7 @@ def test_constructor_column_duplicates(self): columns=['b', 'a', 'a']) - def test_column_duplicates_operations(self): + def test_column_dups_operations(self): def check(result, expected=None): if expected is not None: @@ -2973,22 +2973,6 @@ def check(result, expected=None): expected = DataFrame([[1,5,7.],[1,5,7.],[1,5,7.]],columns=['bar','hello','foo2']) check(df,expected) - # reindex - df = DataFrame([[1,5,7.],[1,5,7.],[1,5,7.]],columns=['bar','a','a']) - expected = DataFrame([[1],[1],[1]],columns=['bar']) - result = df.reindex(columns=['bar']) - check(result,expected) - - result1 = DataFrame([[1],[1],[1]],columns=['bar']).reindex(columns=['bar','foo']) - result2 = df.reindex(columns=['bar','foo']) - check(result2,result1) - - # drop - df = DataFrame([[1,5,7.],[1,5,7.],[1,5,7.]],columns=['bar','a','a']) - df = df.drop(['a'],axis=1) - expected = DataFrame([[1],[1],[1]],columns=['bar']) - check(df,expected) - # values df = DataFrame([[1,2.5],[3,4.5]], index=[1,2], columns=['x','x']) result = df.values @@ -3016,6 +3000,17 @@ def check(result, expected=None): columns=['RT','TClose','TExg','RPT_Date','STK_ID','STK_Name','QT_Close']).set_index(['STK_ID','RPT_Date'],drop=False) assert_frame_equal(result,expected) + # reindex is invalid! + df = DataFrame([[1,5,7.],[1,5,7.],[1,5,7.]],columns=['bar','a','a']) + self.assertRaises(ValueError, df.reindex, columns=['bar']) + self.assertRaises(ValueError, df.reindex, columns=['bar','foo']) + + # drop + df = DataFrame([[1,5,7.],[1,5,7.],[1,5,7.]],columns=['bar','a','a']) + df = df.drop(['a'],axis=1) + expected = DataFrame([[1],[1],[1]],columns=['bar']) + check(df,expected) + def test_insert_benchmark(self): # from the vb_suite/frame_methods/frame_insert_columns N = 10 @@ -7573,6 +7568,21 @@ def test_reindex_fill_value(self): expected = df.reindex(lrange(15)).fillna(0) assert_frame_equal(result, expected) + def test_reindex_dups(self): + + # GH4746, reindex on duplicate index error messages + arr = np.random.randn(10) + df = DataFrame(arr,index=[1,2,3,4,5,1,2,3,4,5]) + + # set index is ok + result = df.copy() + result.index = list(range(len(df))) + expected = DataFrame(arr,index=list(range(len(df)))) + assert_frame_equal(result,expected) + + # reindex fails + self.assertRaises(ValueError, df.reindex, index=list(range(len(df)))) + def test_align(self): af, bf = self.frame.align(self.frame) From 407e904a6ba5f97323a554d690a2e888f4106dbb Mon Sep 17 00:00:00 2001 From: jreback Date: Thu, 5 Sep 2013 21:05:01 -0400 Subject: [PATCH 2/2] CLN: refactored out take from core/frame.py to use core/generic.py version --- doc/source/release.rst | 4 +-- doc/source/v0.13.0.txt | 2 +- pandas/core/frame.py | 42 --------------------------- pandas/core/generic.py | 7 +++-- pandas/core/internals.py | 46 +++++++++++++++++++++++++----- pandas/sparse/tests/test_sparse.py | 4 ++- 6 files changed, 49 insertions(+), 56 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index eae38a75483e4..9a34cdbdfb5a8 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -151,6 +151,7 @@ pandas 0.13 - ``Series.isin()`` and ``DataFrame.isin()`` now raise a ``TypeError`` when passed a string (:issue:`4763`). Pass a ``list`` of one element (containing the string) instead. + - Remove undocumented/unused ``kind`` keyword argument from ``read_excel``, and ``ExcelFile``. (:issue:`4713`, :issue:`4712`) **Internal Refactoring** @@ -172,7 +173,7 @@ See :ref:`Internal Refactoring` - ``_indexed_same,reindex_like,align,where,mask`` - ``fillna,replace`` (``Series`` replace is now consistent with ``DataFrame``) - ``filter`` (also added axis argument to selectively filter on a different axis) - - ``reindex,reindex_axis`` (which was the biggest change to make generic) + - ``reindex,reindex_axis,take`` - ``truncate`` (moved to become part of ``NDFrame``) - These are API changes which make ``Panel`` more consistent with ``DataFrame`` @@ -224,7 +225,6 @@ See :ref:`Internal Refactoring` - Refactor of ``_get_numeric_data/_get_bool_data`` to core/generic.py, allowing Series/Panel functionaility - Refactor of Series arithmetic with time-like objects (datetime/timedelta/time etc.) into a separate, cleaned up wrapper class. (:issue:`4613`) -- Remove undocumented/unused ``kind`` keyword argument from ``read_excel``, and ``ExcelFile``. (:issue:`4713`, :issue:`4712`) **Experimental Features** diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt index d1decc164484d..6b8b7e73f3ac4 100644 --- a/doc/source/v0.13.0.txt +++ b/doc/source/v0.13.0.txt @@ -297,7 +297,7 @@ and behaviors. Series formerly subclassed directly from ``ndarray``. (:issue:`40 - ``_indexed_same,reindex_like,align,where,mask`` - ``fillna,replace`` (``Series`` replace is now consistent with ``DataFrame``) - ``filter`` (also added axis argument to selectively filter on a different axis) - - ``reindex,reindex_axis`` (which was the biggest change to make generic) + - ``reindex,reindex_axis,take`` - ``truncate`` (moved to become part of ``NDFrame``) - These are API changes which make ``Panel`` more consistent with ``DataFrame`` diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7f79b4b6b3d6a..a3eb3ea54c784 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2513,48 +2513,6 @@ def _maybe_cast(values, labels=None): delevel = deprecate('delevel', reset_index) - def take(self, indices, axis=0, convert=True): - """ - Analogous to ndarray.take, return DataFrame corresponding to requested - indices along an axis - - Parameters - ---------- - indices : list / array of ints - axis : {0, 1} - convert : convert indices for negative values, check bounds, default True - mainly useful for an user routine calling - - Returns - ------- - taken : DataFrame - """ - - # check/convert indicies here - if convert: - axis = self._get_axis_number(axis) - indices = _maybe_convert_indices( - indices, len(self._get_axis(axis))) - - if self._is_mixed_type: - if axis == 0: - new_data = self._data.take(indices, axis=1, verify=False) - return DataFrame(new_data) - else: - return self.reindex(columns=indices, takeable=True) - else: - new_values = com.take_nd(self.values, - com._ensure_int64(indices), - axis=axis) - if axis == 0: - new_columns = self.columns - new_index = self.index.take(indices) - else: - new_columns = self.columns.take(indices) - new_index = self.index - return self._constructor(new_values, index=new_index, - columns=new_columns) - #---------------------------------------------------------------------- # Reindex-based selection methods diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 3bebb2bc3660f..7f5b9b7f75545 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -862,12 +862,13 @@ def take(self, indices, axis=0, convert=True): indices = _maybe_convert_indices( indices, len(self._get_axis(axis))) - if axis == 0: + baxis = self._get_block_manager_axis(axis) + if baxis == 0: labels = self._get_axis(axis) new_items = labels.take(indices) - new_data = self._data.reindex_axis(new_items, axis=0) + new_data = self._data.reindex_axis(new_items, indexer=indices, axis=0) else: - new_data = self._data.take(indices, axis=axis, verify=False) + new_data = self._data.take(indices, axis=baxis) return self._constructor(new_data) def select(self, crit, axis=0): diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 6a5757dd7488d..91fdc712fb9b8 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1516,7 +1516,20 @@ def reindex_items_from(self, new_ref_items, indexer=None, method=None, fill_valu if indexer is None: indexer = np.arange(len(self.items)) - new_values = com.take_1d(self.values.values, indexer) + # single block + if self.ndim == 1: + + new_items = new_ref_items + new_values = com.take_1d(self.values.values, indexer) + + else: + + # if we don't overlap at all, then don't include this block + new_items = self.items & new_ref_items + if not len(new_items): + return None + + new_values = self.values.values # fill if needed if method is not None or limit is not None: @@ -1524,7 +1537,7 @@ def reindex_items_from(self, new_ref_items, indexer=None, method=None, fill_valu fill_value = self.fill_value new_values = com.interpolate_2d(new_values, method=method, limit=limit, fill_value=fill_value) - return self.make_block(new_values, items=new_ref_items, ref_items=new_ref_items, copy=copy) + return self.make_block(new_values, items=new_items, ref_items=new_ref_items, copy=copy) def sparse_reindex(self, new_index): """ sparse reindex and return a new block @@ -2794,15 +2807,34 @@ def reindex_items(self, new_items, indexer=None, copy=True, fill_value=None): if indexer is None: for blk in self.blocks: if copy: - new_blocks.append(blk.reindex_items_from(new_items)) + blk = blk.reindex_items_from(new_items) else: blk.ref_items = new_items + if blk is not None: new_blocks.append(blk) else: - for block in self.blocks: - newb = block.reindex_items_from(new_items, copy=copy) - if len(newb.items) > 0: - new_blocks.append(newb) + + # unique + if self.axes[0].is_unique: + for block in self.blocks: + + newb = block.reindex_items_from(new_items, copy=copy) + if newb is not None and len(newb.items) > 0: + new_blocks.append(newb) + + # non-unique + else: + rl = self._set_ref_locs() + for i, idx in enumerate(indexer): + blk, lidx = rl[idx] + item = new_items.take([i]) + blk = make_block(_block_shape(blk.iget(lidx)), + item, + new_items, + ndim=self.ndim, + fastpath=True, + placement = [i]) + new_blocks.append(blk) # add a na block if we are missing items mask = indexer == -1 diff --git a/pandas/sparse/tests/test_sparse.py b/pandas/sparse/tests/test_sparse.py index 91f2fe319957b..e0a9d2b937225 100644 --- a/pandas/sparse/tests/test_sparse.py +++ b/pandas/sparse/tests/test_sparse.py @@ -384,7 +384,9 @@ def test_getitem_slice(self): idx = self.bseries.index res = self.bseries[::2] tm.assert_isinstance(res, SparseSeries) - assert_sp_series_equal(res, self.bseries.reindex(idx[::2])) + + expected = self.bseries.reindex(idx[::2]) + assert_sp_series_equal(res, expected) res = self.bseries[:5] tm.assert_isinstance(res, SparseSeries)