From 0890d070e341df9f5d54c95fca6ec0da168e9fb7 Mon Sep 17 00:00:00 2001 From: jreback Date: Mon, 24 Jun 2013 20:11:35 -0400 Subject: [PATCH] BUG: GH4017, efficiently support non-unique indicies with iloc PERF: getting an indexer with a non_unique index, now MUCH faster PERF: vbench for loc/iloc with dups BUG: sparse reindex needed takeable arg TST BUG: correctly interpret tuple/list in non_unique indexers BUG: df.loc[idx] with out-of-bounds indexers not correctly interpreted PERF: df.loc with non-unique index not blazing fast! --- doc/source/release.rst | 2 ++ doc/source/v0.12.0.txt | 2 ++ pandas/core/frame.py | 21 +++++++++++++-------- pandas/core/index.py | 12 +++++++++--- pandas/core/indexing.py | 17 ++++++++++++++--- pandas/core/series.py | 8 +++++--- pandas/index.pyx | 33 ++++++++++++++++++++++----------- pandas/sparse/frame.py | 5 +++-- pandas/tests/test_indexing.py | 32 +++++++++++++++++++++++++++++++- vb_suite/indexing.py | 16 ++++++++++++++++ 10 files changed, 117 insertions(+), 31 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 917d91a14441e..817d029417aa1 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -212,6 +212,8 @@ pandas 0.12 - Extend ``reindex`` to correctly deal with non-unique indices (:issue:`3679`) - ``DataFrame.itertuples()`` now works with frames with duplicate column names (:issue:`3873`) + - Bug in non-unique indexing via ``iloc`` (:issue:`4017`); added ``takeable`` argument to + ``reindex`` for location-based taking - Fixed bug in groupby with empty series referencing a variable before assignment. (:issue:`3510`) - Allow index name to be used in groupby for non MultiIndex (:issue:`4014`) diff --git a/doc/source/v0.12.0.txt b/doc/source/v0.12.0.txt index e146e892722d8..ccdba4d765aff 100644 --- a/doc/source/v0.12.0.txt +++ b/doc/source/v0.12.0.txt @@ -410,6 +410,8 @@ Bug Fixes - Extend ``reindex`` to correctly deal with non-unique indices (:issue:`3679`) - ``DataFrame.itertuples()`` now works with frames with duplicate column names (:issue:`3873`) + - Bug in non-unique indexing via ``iloc`` (:issue:`4017`); added ``takeable`` argument to + ``reindex`` for location-based taking - ``DataFrame.from_records`` did not accept empty recarrays (:issue:`3682`) - ``read_html`` now correctly skips tests (:issue:`3741`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3a23a212c51e8..5f1ea00e421a8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1979,7 +1979,9 @@ def _ixs(self, i, axis=0, copy=False): else: label = self.index[i] if isinstance(label, Index): - return self.reindex(label) + + # a location index by definition + return self.reindex(label, takeable=True) else: try: new_values = self._data.fast_2d_xs(i, copy=copy) @@ -2590,7 +2592,7 @@ def _align_series(self, other, join='outer', axis=None, level=None, return left_result, right_result def reindex(self, index=None, columns=None, method=None, level=None, - fill_value=NA, limit=None, copy=True): + fill_value=NA, limit=None, copy=True, takeable=False): """Conform DataFrame to new index with optional filling logic, placing NA/NaN in locations having no value in the previous index. A new object is produced unless the new index is equivalent to the current one and @@ -2617,6 +2619,7 @@ def reindex(self, index=None, columns=None, method=None, level=None, "compatible" value limit : int, default None Maximum size gap to forward or backward fill + takeable : the labels are locations (and not labels) Examples -------- @@ -2636,11 +2639,11 @@ def reindex(self, index=None, columns=None, method=None, level=None, if columns is not None: frame = frame._reindex_columns(columns, copy, level, - fill_value, limit) + fill_value, limit, takeable) if index is not None: frame = frame._reindex_index(index, method, copy, level, - fill_value, limit) + fill_value, limit, takeable) return frame @@ -2717,16 +2720,18 @@ def _reindex_multi(self, new_index, new_columns, copy, fill_value): return self.copy() if copy else self def _reindex_index(self, new_index, method, copy, level, fill_value=NA, - limit=None): + limit=None, takeable=False): new_index, indexer = self.index.reindex(new_index, method, level, - limit=limit, copy_if_needed=True) + limit=limit, copy_if_needed=True, + takeable=takeable) return self._reindex_with_indexers(new_index, indexer, None, None, copy, fill_value) def _reindex_columns(self, new_columns, copy, level, fill_value=NA, - limit=None): + limit=None, takeable=False): new_columns, indexer = self.columns.reindex(new_columns, level=level, - limit=limit, copy_if_needed=True) + limit=limit, copy_if_needed=True, + takeable=takeable) return self._reindex_with_indexers(None, None, new_columns, indexer, copy, fill_value) diff --git a/pandas/core/index.py b/pandas/core/index.py index c06c46cde36c8..05cb360d12e16 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -920,7 +920,8 @@ def _get_method(self, method): } return aliases.get(method, method) - def reindex(self, target, method=None, level=None, limit=None, copy_if_needed=False): + def reindex(self, target, method=None, level=None, limit=None, + copy_if_needed=False, takeable=False): """ For Index, simply returns the new index and the results of get_indexer. Provided here to enable an interface that is amenable for @@ -953,7 +954,11 @@ def reindex(self, target, method=None, level=None, limit=None, copy_if_needed=Fa if method is not None or limit is not None: raise ValueError("cannot reindex a non-unique index " "with a method or limit") - indexer, missing = self.get_indexer_non_unique(target) + if takeable: + indexer = target + missing = (target>=len(target)).nonzero()[0] + else: + indexer, missing = self.get_indexer_non_unique(target) return target, indexer @@ -2202,7 +2207,8 @@ def get_indexer(self, target, method=None, limit=None): return com._ensure_platform_int(indexer) - def reindex(self, target, method=None, level=None, limit=None, copy_if_needed=False): + def reindex(self, target, method=None, level=None, limit=None, + copy_if_needed=False, takeable=False): """ Performs any necessary conversion on the input index and calls get_indexer. This method is here so MultiIndex and an Index of diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 33f72a0d15415..27c12fcd2e8eb 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -476,10 +476,21 @@ def _reindex(keys, level=None): cur_indexer = com._ensure_int64(l[check]) new_labels = np.empty(tuple([len(indexer)]),dtype=object) - new_labels[cur_indexer] = cur_labels - new_labels[missing_indexer] = missing_labels + new_labels[cur_indexer] = cur_labels + new_labels[missing_indexer] = missing_labels + new_indexer = (Index(cur_indexer) + Index(missing_indexer)).values + new_indexer[missing_indexer] = -1 - result = result.reindex_axis(new_labels,axis=axis) + # need to reindex with an indexer on a specific axis + from pandas.core.frame import DataFrame + if not (type(self.obj) == DataFrame): + raise NotImplementedError("cannot handle non-unique indexing for non-DataFrame (yet)") + + args = [None] * 4 + args[2*axis] = new_labels + args[2*axis+1] = new_indexer + + result = result._reindex_with_indexers(*args, copy=False, fill_value=np.nan) return result diff --git a/pandas/core/series.py b/pandas/core/series.py index bd76d91629121..9b11f7c7b0f66 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -598,7 +598,7 @@ def _ixs(self, i, axis=0): else: label = self.index[i] if isinstance(label, Index): - return self.reindex(label) + return self.reindex(label, takeable=True) else: return _index.get_value_at(self, i) @@ -2618,7 +2618,7 @@ def _reindex_indexer(self, new_index, indexer, copy): return self._constructor(new_values, new_index, name=self.name) def reindex(self, index=None, method=None, level=None, fill_value=pa.NA, - limit=None, copy=True): + limit=None, copy=True, takeable=False): """Conform Series to new index with optional filling logic, placing NA/NaN in locations having no value in the previous index. A new object is produced unless the new index is equivalent to the current one and @@ -2643,6 +2643,7 @@ def reindex(self, index=None, method=None, level=None, fill_value=pa.NA, "compatible" value limit : int, default None Maximum size gap to forward or backward fill + takeable : the labels are locations (and not labels) Returns ------- @@ -2664,7 +2665,8 @@ def reindex(self, index=None, method=None, level=None, fill_value=pa.NA, return Series(nan, index=index, name=self.name) new_index, indexer = self.index.reindex(index, method=method, - level=level, limit=limit) + level=level, limit=limit, + takeable=takeable) new_values = com.take_1d(self.values, indexer, fill_value=fill_value) return Series(new_values, index=new_index, name=self.name) diff --git a/pandas/index.pyx b/pandas/index.pyx index 85a83b745510f..ac2638b62977c 100644 --- a/pandas/index.pyx +++ b/pandas/index.pyx @@ -272,33 +272,44 @@ cdef class IndexEngine: to the -1 indicies in the results """ cdef: - ndarray values + ndarray values, x ndarray[int64_t] result, missing - object v, val + set stargets + dict d = {} + object val int count = 0, count_missing = 0 - Py_ssize_t i, j, n, found + Py_ssize_t i, j, n, n_t self._ensure_mapping_populated() values = self._get_index_values() + stargets = set(targets) n = len(values) n_t = len(targets) - result = np.empty(n+n_t, dtype=np.int64) + result = np.empty(n*n_t, dtype=np.int64) missing = np.empty(n_t, dtype=np.int64) + # form the set of the results (like ismember) + members = np.empty(n, dtype=np.uint8) + for i in range(n): + val = util.get_value_1d(values, i) + if val in stargets: + if val not in d: + d[val] = [] + d[val].append(i) + for i in range(n_t): - val = util.get_value_at(targets, i) - found = 0 - for j in range(n): - v = util.get_value_at(values, j) + val = util.get_value_1d(targets, i) - if v == val: + # found + if val in d: + for j in d[val]: result[count] = j count += 1 - found = 1 # value not found - if found == 0: + else: + result[count] = -1 count += 1 missing[count_missing] = i diff --git a/pandas/sparse/frame.py b/pandas/sparse/frame.py index 0a08fba49afeb..f5e57efdcb166 100644 --- a/pandas/sparse/frame.py +++ b/pandas/sparse/frame.py @@ -583,7 +583,7 @@ def _combine_const(self, other, func): columns=self.columns) def _reindex_index(self, index, method, copy, level, fill_value=np.nan, - limit=None): + limit=None, takeable=False): if level is not None: raise TypeError('Reindex by level not supported for sparse') @@ -614,7 +614,8 @@ def _reindex_index(self, index, method, copy, level, fill_value=np.nan, return SparseDataFrame(new_series, index=index, columns=self.columns, default_fill_value=self.default_fill_value) - def _reindex_columns(self, columns, copy, level, fill_value, limit=None): + def _reindex_columns(self, columns, copy, level, fill_value, limit=None, + takeable=False): if level is not None: raise TypeError('Reindex by level not supported for sparse') diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index 0719d9c9a87db..8b6bf1ed7f651 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -8,7 +8,7 @@ import numpy as np from numpy.testing import assert_array_equal -import pandas as pan +import pandas as pd import pandas.core.common as com from pandas.core.api import (DataFrame, Index, Series, Panel, notnull, isnull, MultiIndex, DatetimeIndex, Timestamp) @@ -1037,6 +1037,36 @@ def test_loc_name(self): result = df.loc[[0, 1]].index.name self.assert_(result == 'index_name') + def test_iloc_non_unique_indexing(self): + + #GH 4017, non-unique indexing (on the axis) + df = DataFrame({'A' : [0.1] * 3000, 'B' : [1] * 3000}) + idx = np.array(range(30)) * 99 + expected = df.iloc[idx] + + df3 = pd.concat([df, 2*df, 3*df]) + result = df3.iloc[idx] + + assert_frame_equal(result, expected) + + df2 = DataFrame({'A' : [0.1] * 1000, 'B' : [1] * 1000}) + df2 = pd.concat([df2, 2*df2, 3*df2]) + + sidx = df2.index.to_series() + expected = df2.iloc[idx[idx<=sidx.max()]] + + new_list = [] + for r, s in expected.iterrows(): + new_list.append(s) + new_list.append(s*2) + new_list.append(s*3) + + expected = DataFrame(new_list) + expected = pd.concat([ expected, DataFrame(index=idx[idx>sidx.max()]) ]) + result = df2.loc[idx] + assert_frame_equal(result, expected) + + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/vb_suite/indexing.py b/vb_suite/indexing.py index 7847a880918dc..9f07cc6ed15c3 100644 --- a/vb_suite/indexing.py +++ b/vb_suite/indexing.py @@ -148,3 +148,19 @@ indexing_panel_subset = Benchmark('p.ix[inds, inds, inds]', setup, start_date=datetime(2012, 1, 1)) + +#---------------------------------------------------------------------- +# Iloc + +setup = common_setup + """ +df = DataFrame({'A' : [0.1] * 3000, 'B' : [1] * 3000}) +idx = np.array(range(30)) * 99 +df2 = DataFrame({'A' : [0.1] * 1000, 'B' : [1] * 1000}) +df2 = concat([df2, 2*df2, 3*df2]) +""" + +frame_iloc_dups = Benchmark('df2.iloc[idx]', setup, + start_date=datetime(2013, 1, 1)) + +frame_loc_dups = Benchmark('df2.loc[idx]', setup, + start_date=datetime(2013, 1, 1))