From 205b5cb17f481b3c60b6a729bee0fb4c738b2ab4 Mon Sep 17 00:00:00 2001 From: jreback Date: Wed, 21 Aug 2013 23:10:46 -0400 Subject: [PATCH] PERF: optimize __getitem__ in DataFrame for columns lookup --- doc/source/release.rst | 4 +++- pandas/core/frame.py | 25 +++++++++++++++++++------ vb_suite/frame_methods.py | 17 +++++++++++++++++ 3 files changed, 39 insertions(+), 7 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 745876e8c448e..578e235b0f88b 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -59,6 +59,8 @@ pandas 0.13 - A Series of dtype ``timedelta64[ns]`` can now be divided by another ``timedelta64[ns]`` object to yield a ``float64`` dtyped Series. This is frequency conversion. + - Performance improvements with ``__getitem__`` on ``DataFrames`` + when the key is a column **API Changes** @@ -183,7 +185,7 @@ See :ref:`Internal Refactoring` - Refactor ``Series.reindex`` to core/generic.py (:issue:`4604`, :issue:`4618`), allow ``method=`` in reindexing on a Series to work - ``Series.copy`` no longer accepts the ``order`` parameter and is now consistent with ``NDFrame`` copy -- Refactor ``rename`` methods to core/generic.py; fixes ``Series.rename`` for (:issue`4605`), and adds ``rename`` +- Refactor ``rename`` methods to core/generic.py; fixes ``Series.rename`` for (:issue:`4605`), and adds ``rename`` with the same signature for ``Panel`` **Experimental Features** diff --git a/pandas/core/frame.py b/pandas/core/frame.py index fce6896027867..31f7179f8e328 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1810,6 +1810,14 @@ def iget_value(self, i, j): def __getitem__(self, key): + # shortcut if we are an actual column + is_mi_columns = isinstance(self.columns, MultiIndex) + try: + if key in self.columns and not is_mi_columns: + return self._getitem_column(key) + except: + pass + # see if we can slice the rows indexer = _convert_to_index_sliceable(self, key) if indexer is not None: @@ -1820,15 
+1828,20 @@ def __getitem__(self, key): return self._getitem_array(key) elif isinstance(key, DataFrame): return self._getitem_frame(key) - elif isinstance(self.columns, MultiIndex): + elif is_mi_columns: return self._getitem_multilevel(key) else: - # get column - if self.columns.is_unique: - return self._get_item_cache(key) + return self._getitem_column(key) + + def _getitem_column(self, key): + """ return the actual column """ + + # get column + if self.columns.is_unique: + return self._get_item_cache(key) - # duplicate columns - return self._constructor(self._data.get(key)) + # duplicate columns + return self._constructor(self._data.get(key)) def _getitem_slice(self, key): return self._slice(key, axis=0) diff --git a/vb_suite/frame_methods.py b/vb_suite/frame_methods.py index f6909802f2d77..67c0aa227f886 100644 --- a/vb_suite/frame_methods.py +++ b/vb_suite/frame_methods.py @@ -84,6 +84,8 @@ setup = common_setup + """ df = DataFrame(randn(10000, 1000)) +df2 = DataFrame(randn(3000,1),columns=['A']) +df3 = DataFrame(randn(3000,1)) def f(): if hasattr(df, '_item_cache'): @@ -94,6 +96,15 @@ def f(): def g(): for name, col in df.iteritems(): pass + +def h(): + for i in xrange(10000): + df2['A'] + +def j(): + for i in xrange(10000): + df3[0] + """ # as far back as the earliest test currently in the suite @@ -103,6 +114,12 @@ def g(): frame_iteritems_cached = Benchmark('g()', setup, start_date=datetime(2010, 6, 1)) +frame_getitem_single_column = Benchmark('h()', setup, + start_date=datetime(2010, 6, 1)) + +frame_getitem_single_column2 = Benchmark('j()', setup, + start_date=datetime(2010, 6, 1)) + #---------------------------------------------------------------------- # to_string