diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index 3c15fdea70858..322c58115de3c 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -1261,6 +1261,15 @@ numpy array. For instance, Float64Index ------------ +.. note:: + + As of 0.14.0, ``Float64Index`` is backed by a native ``float64`` dtype + array. Prior to 0.14.0, ``Float64Index`` was backed by an ``object`` dtype + array. Using a ``float64`` dtype in the backend speeds up arithmetic + operations by about 30x and makes boolean indexing operations on the + ``Float64Index`` itself about 2x as fast. + + .. versionadded:: 0.13.0 By default a ``Float64Index`` will be automatically created when passing floating, or mixed-integer-floating values in index creation. diff --git a/doc/source/release.rst b/doc/source/release.rst index 08d7bf9b8728b..2fea35a887f34 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -258,6 +258,8 @@ Improvements to existing features - Performance improvement for ``DataFrame.from_records`` when reading a specified number of rows from an iterable (:issue:`6700`) - :ref:`Holidays and holiday calendars` are now available and can be used with CustomBusinessDay (:issue:`6719`) +- ``Float64Index`` is now backed by a ``float64`` dtype ndarray instead of an + ``object`` dtype array (:issue:`6471`). .. _release.bug_fixes-0.14.0: diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index ded10fd75e8d4..11296a43e230d 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -468,6 +468,8 @@ Enhancements file. (:issue:`6545`) - ``pandas.io.gbq`` now handles reading unicode strings properly. (:issue:`5940`) - :ref:`Holidays Calendars` are now available and can be used with CustomBusinessDay (:issue:`6719`) +- ``Float64Index`` is now backed by a ``float64`` dtype ndarray instead of an + ``object`` dtype array (:issue:`6471`).
Performance ~~~~~~~~~~~ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d330d4309b13e..a00b729f1735a 100755 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1843,7 +1843,6 @@ def eval(self, expr, **kwargs): kwargs['resolvers'] = kwargs.get('resolvers', ()) + resolvers return _eval(expr, **kwargs) - def _box_item_values(self, key, values): items = self.columns[self.columns.get_loc(key)] if values.ndim == 2: @@ -2566,7 +2565,7 @@ def sort_index(self, axis=0, by=None, ascending=True, inplace=False, ------- sorted : DataFrame """ - + from pandas.core.groupby import _lexsort_indexer, _nargsort axis = self._get_axis_number(axis) if axis not in [0, 1]: # pragma: no cover @@ -2622,7 +2621,7 @@ def trans(v): else: indexer = _nargsort(labels, kind=kind, ascending=ascending, na_position=na_position) - + if inplace: if axis == 1: new_data = self._data.reindex_items( @@ -3285,7 +3284,7 @@ def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None, code path. This can lead to unexpected behavior if func has side-effects, as they will take effect twice for the first column/row. 
- + Examples -------- >>> df.apply(numpy.sqrt) # returns DataFrame diff --git a/pandas/core/index.py b/pandas/core/index.py index a581a8753ae51..8748d0081d2e9 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -16,7 +16,8 @@ from pandas.util.decorators import cache_readonly, deprecate from pandas.core.common import isnull, array_equivalent import pandas.core.common as com -from pandas.core.common import _values_from_object, is_float, is_integer, ABCSeries +from pandas.core.common import (_values_from_object, is_float, is_integer, + ABCSeries) from pandas.core.config import get_option # simplify @@ -27,6 +28,13 @@ __all__ = ['Index'] +def _try_get_item(x): + try: + return x.item() + except AttributeError: + return x + + def _indexOp(opname): """ Wrapper function for index comparison operations, to avoid @@ -1911,11 +1919,17 @@ class Float64Index(Index): Notes ----- - An Index instance can **only** contain hashable objects + A Float64Index instance can **only** contain hashable objects """ # when this is not longer object dtype this can be changed - #_engine_type = _index.Float64Engine + _engine_type = _index.Float64Engine + _groupby = _algos.groupby_float64 + _arrmap = _algos.arrmap_float64 + _left_indexer_unique = _algos.left_join_indexer_unique_float64 + _left_indexer = _algos.left_join_indexer_float64 + _inner_indexer = _algos.inner_join_indexer_float64 + _outer_indexer = _algos.outer_join_indexer_float64 def __new__(cls, data, dtype=None, copy=False, name=None, fastpath=False): @@ -1938,9 +1952,9 @@ def __new__(cls, data, dtype=None, copy=False, name=None, fastpath=False): raise TypeError('Unsafe NumPy casting, you must ' 'explicitly cast') - # coerce to object for storage - if not subarr.dtype == np.object_: - subarr = subarr.astype(object) + # coerce to float64 for storage + if subarr.dtype != np.float64: + subarr = subarr.astype(np.float64) subarr = subarr.view(cls) subarr.name = name @@ -1951,13 +1965,12 @@ def inferred_type(self): return
'floating' def astype(self, dtype): - if np.dtype(dtype) != np.object_: - raise TypeError('Setting %s dtype to anything other than object ' - 'is not supported' % self.__class__) - return Index(self.values, name=self.name, dtype=object) + if np.dtype(dtype) not in (np.object, np.float64): + raise TypeError('Setting %s dtype to anything other than ' + 'float64 or object is not supported' % self.__class__) + return Index(self.values, name=self.name, dtype=dtype) def _convert_scalar_indexer(self, key, typ=None): - if typ == 'iloc': return super(Float64Index, self)._convert_scalar_indexer(key, typ=typ) @@ -1968,8 +1981,6 @@ def _convert_slice_indexer(self, key, typ=None): unless we are iloc """ if typ == 'iloc': return self._convert_slice_indexer_iloc(key) - elif typ == 'getitem': - pass # allow floats here self._validate_slicer( @@ -2008,13 +2019,75 @@ def equals(self, other): try: if not isinstance(other, Float64Index): other = self._constructor(other) - if self.dtype != other.dtype or self.shape != other.shape: return False + if self.dtype != other.dtype or self.shape != other.shape: + return False left, right = self.values, other.values - return ((left == right) | (isnull(left) & isnull(right))).all() + return ((left == right) | (self._isnan & other._isnan)).all() except TypeError: # e.g. 
fails in numpy 1.6 with DatetimeIndex #1681 return False + def __contains__(self, other): + if super(Float64Index, self).__contains__(other): + return True + + try: + # if other is a sequence this throws a ValueError + return np.isnan(other) and self._hasnans + except ValueError: + try: + return len(other) <= 1 and _try_get_item(other) in self + except TypeError: + return False + + def get_loc(self, key): + if np.isnan(key): + try: + return self._nan_idxs.item() + except ValueError: + return self._nan_idxs + return super(Float64Index, self).get_loc(key) + + @property + def is_all_dates(self): + """ + Checks that all the labels are datetime objects + """ + return False + + @cache_readonly + def _nan_idxs(self): + w, = self._isnan.nonzero() + return w + + @cache_readonly + def _isnan(self): + return np.isnan(self.values) + + @cache_readonly + def _hasnans(self): + return self._isnan.any() + + @cache_readonly + def is_unique(self): + return super(Float64Index, self).is_unique and self._nan_idxs.size < 2 + + def isin(self, values): + """ + Compute boolean array of whether each index value is found in the + passed set of values + + Parameters + ---------- + values : set or sequence of values + + Returns + ------- + is_contained : ndarray (boolean dtype) + """ + value_set = set(values) + return lib.ismember_nans(self._array_values(), value_set, + self._hasnans) class MultiIndex(Index): diff --git a/pandas/hashtable.pxd b/pandas/hashtable.pxd index ac68dadd882de..97b6687d061e9 100644 --- a/pandas/hashtable.pxd +++ b/pandas/hashtable.pxd @@ -1,4 +1,4 @@ -from khash cimport * +from khash cimport kh_int64_t, kh_float64_t, kh_pymap_t, int64_t, float64_t # prototypes for sharing @@ -11,12 +11,11 @@ cdef class Int64HashTable(HashTable): cpdef get_item(self, int64_t val) cpdef set_item(self, int64_t key, Py_ssize_t val) - cdef class Float64HashTable(HashTable): cdef kh_float64_t *table - # cpdef get_item(self, float64_t val) - # cpdef set_item(self, float64_t key, Py_ssize_t 
val) + cpdef get_item(self, float64_t val) + cpdef set_item(self, float64_t key, Py_ssize_t val) cdef class PyObjectHashTable(HashTable): cdef kh_pymap_t *table diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx index d4ed7fac5d6b7..2b3aa7b52d6c1 100644 --- a/pandas/hashtable.pyx +++ b/pandas/hashtable.pyx @@ -145,10 +145,6 @@ cdef class HashTable: cdef class StringHashTable(HashTable): cdef kh_str_t *table - # def __init__(self, size_hint=1): - # if size_hint is not None: - # kh_resize_str(self.table, size_hint) - def __cinit__(self, int size_hint=1): self.table = kh_init_str() if size_hint is not None: @@ -539,8 +535,6 @@ cdef class Int64HashTable: #(HashTable): cdef class Float64HashTable(HashTable): - # cdef kh_float64_t *table - def __cinit__(self, size_hint=1): self.table = kh_init_float64() if size_hint is not None: @@ -549,9 +543,34 @@ cdef class Float64HashTable(HashTable): def __len__(self): return self.table.size + cpdef get_item(self, float64_t val): + cdef khiter_t k + k = kh_get_float64(self.table, val) + if k != self.table.n_buckets: + return self.table.vals[k] + else: + raise KeyError(val) + + cpdef set_item(self, float64_t key, Py_ssize_t val): + cdef: + khiter_t k + int ret = 0 + + k = kh_put_float64(self.table, key, &ret) + self.table.keys[k] = key + if kh_exist_float64(self.table, k): + self.table.vals[k] = val + else: + raise KeyError(key) + def __dealloc__(self): kh_destroy_float64(self.table) + def __contains__(self, object key): + cdef khiter_t k + k = kh_get_float64(self.table, key) + return k != self.table.n_buckets + def factorize(self, ndarray[float64_t] values): uniques = Float64Vector() labels = self.get_labels(values, uniques, 0, -1) diff --git a/pandas/index.pyx b/pandas/index.pyx index e5cfa3f7c6f16..ae209b58136e1 100644 --- a/pandas/index.pyx +++ b/pandas/index.pyx @@ -398,6 +398,9 @@ cdef class Float64Engine(IndexEngine): cdef _make_hash_table(self, n): return _hash.Float64HashTable(n) + cdef _get_index_values(self): + 
return algos.ensure_float64(self.vgetter()) + def _call_monotonic(self, values): return algos.is_monotonic_float64(values) diff --git a/pandas/lib.pyx b/pandas/lib.pyx index dccc68ab59ad3..a1fef095ea277 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -92,6 +92,22 @@ cpdef map_indices_list(list index): from libc.stdlib cimport malloc, free + +def ismember_nans(float64_t[:] arr, set values, bint hasnans): + cdef: + Py_ssize_t i, n + ndarray[uint8_t] result + float64_t val + + n = len(arr) + result = np.empty(n, dtype=np.uint8) + for i in range(n): + val = arr[i] + result[i] = val in values or hasnans and isnan(val) + + return result.view(np.bool_) + + def ismember(ndarray arr, set values): ''' Checks whether @@ -114,10 +130,7 @@ def ismember(ndarray arr, set values): result = np.empty(n, dtype=np.uint8) for i in range(n): val = util.get_value_at(arr, i) - if val in values: - result[i] = 1 - else: - result[i] = 0 + result[i] = val in values return result.view(np.bool_) diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index f4d90b533a0f7..ecb09ac395417 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -37,6 +37,7 @@ def _skip_if_need_numpy_1_7(): if _np_version_under1p7: raise nose.SkipTest('numpy >= 1.7 required') + class TestIndex(tm.TestCase): _multiprocess_can_split_ = True @@ -835,15 +836,15 @@ def test_constructor(self): self.assertIsInstance(index, Float64Index) index = Float64Index(np.array([1.,2,3,4,5])) self.assertIsInstance(index, Float64Index) - self.assertEqual(index.dtype, object) + self.assertEqual(index.dtype, float) index = Float64Index(np.array([1.,2,3,4,5]),dtype=np.float32) self.assertIsInstance(index, Float64Index) - self.assertEqual(index.dtype, object) + self.assertEqual(index.dtype, np.float64) index = Float64Index(np.array([1,2,3,4,5]),dtype=np.float32) self.assertIsInstance(index, Float64Index) - self.assertEqual(index.dtype, object) + self.assertEqual(index.dtype, np.float64) # nan handling 
result = Float64Index([np.nan, np.nan]) @@ -904,6 +905,15 @@ def test_equals(self): i2 = Float64Index([1.0,np.nan]) self.assertTrue(i.equals(i2)) + def test_contains_nans(self): + i = Float64Index([1.0, 2.0, np.nan]) + self.assertTrue(np.nan in i) + + def test_contains_not_nans(self): + i = Float64Index([1.0, 2.0, np.nan]) + self.assertTrue(1.0 in i) + + class TestInt64Index(tm.TestCase): _multiprocess_can_split_ = True diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index b51ad3e15087c..261e1dd2a590c 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -4,23 +4,20 @@ import warnings from pandas.compat import range, lrange, lzip, StringIO, lmap, map -from numpy import random, nan +from numpy import nan from numpy.random import randn import numpy as np -from numpy.testing import assert_array_equal import pandas as pd import pandas.core.common as com -from pandas.core.api import (DataFrame, Index, Series, Panel, notnull, isnull, - MultiIndex, DatetimeIndex, Float64Index, Timestamp) +from pandas.core.api import (DataFrame, Index, Series, Panel, isnull, + MultiIndex, Float64Index, Timestamp) from pandas.util.testing import (assert_almost_equal, assert_series_equal, assert_frame_equal, assert_panel_equal) -from pandas import compat, concat +from pandas import concat import pandas.util.testing as tm -import pandas.lib as lib from pandas import date_range -from numpy.testing.decorators import slow _verbose = False @@ -1201,17 +1198,23 @@ def test_ix_general(self): # ix general issues # GH 2817 - data={'amount': {0: 700, 1: 600, 2: 222, 3: 333, 4: 444}, - 'col': {0: 3.5, 1: 3.5, 2: 4.0, 3: 4.0, 4: 4.0}, - 'year': {0: 2012, 1: 2011, 2: 2012, 3: 2012, 4: 2012}} - df = DataFrame(data).set_index(keys=['col','year']) + data = {'amount': {0: 700, 1: 600, 2: 222, 3: 333, 4: 444}, + 'col': {0: 3.5, 1: 3.5, 2: 4.0, 3: 4.0, 4: 4.0}, + 'year': {0: 2012, 1: 2011, 2: 2012, 3: 2012, 4: 2012}} + df = 
DataFrame(data).set_index(keys=['col', 'year']) + key = 4.0, 2012 # this should raise correct error - self.assertRaises(KeyError, df.ix.__getitem__, tuple([4.0,2012])) + with tm.assertRaises(KeyError): + df.ix[key] # this is ok df.sortlevel(inplace=True) - df.ix[(4.0,2012)] + res = df.ix[key] + index = MultiIndex.from_arrays([[4] * 3, [2012] * 3], + names=['col', 'year']) + expected = DataFrame({'amount': [222, 333, 444]}, index=index) + tm.assert_frame_equal(res, expected) def test_ix_weird_slicing(self): ## http://stackoverflow.com/q/17056560/1240268 diff --git a/vb_suite/index_object.py b/vb_suite/index_object.py index cb13d63cd726c..5a98481a689a6 100644 --- a/vb_suite/index_object.py +++ b/vb_suite/index_object.py @@ -59,3 +59,49 @@ index_str_slice_indexer_even = Benchmark('idx[::2]', setup) index_str_boolean_indexer = Benchmark('idx[mask]', setup) index_str_boolean_series_indexer = Benchmark('idx[series_mask]', setup) + +#---------------------------------------------------------------------- +# float64 index +#---------------------------------------------------------------------- +# construction +setup = common_setup + """ +baseidx = np.arange(1e6) +""" + +index_float64_construct = Benchmark('Index(baseidx)', setup, + name='index_float64_construct', + start_date=datetime(2014, 4, 13)) + +setup = common_setup + """ +idx = tm.makeFloatIndex(1000000) + +mask = np.arange(idx.size) % 3 == 0 +series_mask = Series(mask) +""" +#---------------------------------------------------------------------- +# getting +index_float64_get = Benchmark('idx[1]', setup, name='index_float64_get', + start_date=datetime(2014, 4, 13)) + + +#---------------------------------------------------------------------- +# slicing +index_float64_slice_indexer_basic = Benchmark('idx[:-1]', setup, + name='index_float64_slice_indexer_basic', + start_date=datetime(2014, 4, 13)) +index_float64_slice_indexer_even = Benchmark('idx[::2]', setup, + name='index_float64_slice_indexer_even', + 
start_date=datetime(2014, 4, 13)) +index_float64_boolean_indexer = Benchmark('idx[mask]', setup, + name='index_float64_boolean_indexer', + start_date=datetime(2014, 4, 13)) +index_float64_boolean_series_indexer = Benchmark('idx[series_mask]', setup, + name='index_float64_boolean_series_indexer', + start_date=datetime(2014, 4, 13)) + +#---------------------------------------------------------------------- +# arith ops +index_float64_mul = Benchmark('idx * 2', setup, name='index_float64_mul', + start_date=datetime(2014, 4, 13)) +index_float64_div = Benchmark('idx / 2', setup, name='index_float64_div', + start_date=datetime(2014, 4, 13))