ENH: Use Float64HashTable for Float64Index backend

cpcloud · cpcloud · commit 7e394d023e02 · 2014-04-13T21:27:28.000-04:00
diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst
@@ -1261,6 +1261,15 @@ numpy array.  For instance,
 Float64Index
 ------------
 
+.. note::
+
+   As of 0.14.0, ``Float64Index`` is backed by a native ``float64`` dtype
+   array. Prior to 0.14.0, ``Float64Index`` was backed by an ``object`` dtype
+   array. Using a ``float64`` dtype in the backend speeds up arithmetic
+   operations by about 30x and makes boolean indexing operations on the
+   ``Float64Index`` itself about 2x as fast.
+
+
 .. versionadded:: 0.13.0
 
 By default a ``Float64Index`` will be automatically created when passing floating, or mixed-integer-floating values in index creation.
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -258,6 +258,8 @@ Improvements to existing features
 - Performance improvement for ``DataFrame.from_records`` when reading a
   specified number of rows from an iterable (:issue:`6700`)
 - :ref:`Holidays and holiday calendars<timeseries.holiday>` are now available and can be used with CustomBusinessDay (:issue:`6719`)
+- ``Float64Index`` is now backed by a ``float64`` dtype ndarray instead of an
+  ``object`` dtype array (:issue:`6471`).
 
 .. _release.bug_fixes-0.14.0:
 
diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt
@@ -468,6 +468,8 @@ Enhancements
   file. (:issue:`6545`)
 - ``pandas.io.gbq`` now handles reading unicode strings properly. (:issue:`5940`)
 - :ref:`Holidays Calendars<timeseries.holiday>` are now available and can be used with CustomBusinessDay (:issue:`6719`)
+- ``Float64Index`` is now backed by a ``float64`` dtype ndarray instead of an
+  ``object`` dtype array (:issue:`6471`).
 
 Performance
 ~~~~~~~~~~~
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -1843,7 +1843,6 @@ def eval(self, expr, **kwargs):
         kwargs['resolvers'] = kwargs.get('resolvers', ()) + resolvers
         return _eval(expr, **kwargs)
 
-
     def _box_item_values(self, key, values):
         items = self.columns[self.columns.get_loc(key)]
         if values.ndim == 2:
@@ -2566,7 +2565,7 @@ def sort_index(self, axis=0, by=None, ascending=True, inplace=False,
         -------
         sorted : DataFrame
         """
-        
+
         from pandas.core.groupby import _lexsort_indexer, _nargsort
         axis = self._get_axis_number(axis)
         if axis not in [0, 1]:  # pragma: no cover
@@ -2622,7 +2621,7 @@ def trans(v):
         else:
             indexer = _nargsort(labels, kind=kind, ascending=ascending,
                                 na_position=na_position)
-            
+
         if inplace:
             if axis == 1:
                 new_data = self._data.reindex_items(
@@ -3285,7 +3284,7 @@ def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None,
         code path. This can lead to unexpected behavior if func has
         side-effects, as they will take effect twice for the first
         column/row.
-        
+
         Examples
         --------
         >>> df.apply(numpy.sqrt) # returns DataFrame
diff --git a/pandas/core/index.py b/pandas/core/index.py
@@ -16,7 +16,8 @@
 from pandas.util.decorators import cache_readonly, deprecate
 from pandas.core.common import isnull, array_equivalent
 import pandas.core.common as com
-from pandas.core.common import _values_from_object, is_float, is_integer, ABCSeries
+from pandas.core.common import (_values_from_object, is_float, is_integer,
+                                ABCSeries)
 from pandas.core.config import get_option
 
 # simplify
@@ -27,6 +28,13 @@
 __all__ = ['Index']
 
 
+def _try_get_item(x):  # best-effort unboxing of ``x`` via its ``item`` method
+    try:
+        return x.item()
+    except AttributeError:  # ``x`` offers no usable ``item``: pass it through
+        return x
+
+
 def _indexOp(opname):
     """
     Wrapper function for index comparison operations, to avoid
@@ -1911,11 +1919,17 @@ class Float64Index(Index):
 
     Notes
     -----
-    An Index instance can **only** contain hashable objects
+    A Float64Index instance can **only** contain hashable objects
     """
 
     # when this is not longer object dtype this can be changed
-    #_engine_type = _index.Float64Engine
+    _engine_type = _index.Float64Engine
+    _groupby = _algos.groupby_float64
+    _arrmap = _algos.arrmap_float64
+    _left_indexer_unique = _algos.left_join_indexer_unique_float64
+    _left_indexer = _algos.left_join_indexer_float64
+    _inner_indexer = _algos.inner_join_indexer_float64
+    _outer_indexer = _algos.outer_join_indexer_float64
 
     def __new__(cls, data, dtype=None, copy=False, name=None, fastpath=False):
 
@@ -1938,9 +1952,9 @@ def __new__(cls, data, dtype=None, copy=False, name=None, fastpath=False):
             raise TypeError('Unsafe NumPy casting, you must '
                             'explicitly cast')
 
-        # coerce to object for storage
-        if not subarr.dtype == np.object_:
-            subarr = subarr.astype(object)
+        # coerce to float64 for storage
+        if subarr.dtype != np.float64:
+            subarr = subarr.astype(np.float64)
 
         subarr = subarr.view(cls)
         subarr.name = name
@@ -1951,13 +1965,12 @@ def inferred_type(self):
         return 'floating'
 
     def astype(self, dtype):
-        if np.dtype(dtype) != np.object_:
-            raise TypeError('Setting %s dtype to anything other than object '
-                            'is not supported' % self.__class__)
-        return Index(self.values, name=self.name, dtype=object)
+        if np.dtype(dtype) not in (np.object_, np.float64):  # only these two
+            raise TypeError('Setting %s dtype to anything other than float64 '
+                            'or object is not supported' % self.__class__)
+        return Index(self.values, name=self.name, dtype=dtype)
 
     def _convert_scalar_indexer(self, key, typ=None):
-
         if typ == 'iloc':
             return super(Float64Index, self)._convert_scalar_indexer(key,
                                                                      typ=typ)
@@ -1968,8 +1981,6 @@ def _convert_slice_indexer(self, key, typ=None):
             unless we are iloc """
         if typ == 'iloc':
             return self._convert_slice_indexer_iloc(key)
-        elif typ == 'getitem':
-            pass
 
         # allow floats here
         self._validate_slicer(
@@ -2008,13 +2019,75 @@ def equals(self, other):
         try:
             if not isinstance(other, Float64Index):
                 other = self._constructor(other)
-            if self.dtype != other.dtype or self.shape != other.shape: return False
+            if self.dtype != other.dtype or self.shape != other.shape:
+                return False
             left, right = self.values, other.values
-            return ((left == right) | (isnull(left) & isnull(right))).all()
+            return ((left == right) | (self._isnan & other._isnan)).all()
         except TypeError:
             # e.g. fails in numpy 1.6 with DatetimeIndex #1681
             return False
 
+    def __contains__(self, other):  # NaN-aware membership test
+        if super(Float64Index, self).__contains__(other):
+            return True
+        try:
+            return np.isnan(other) and self._hasnans
+        except TypeError:  # np.isnan cannot handle non-numeric values
+            return False
+        except ValueError:  # truth value of a multi-element result is ambiguous
+            try:
+                return len(other) <= 1 and _try_get_item(other) in self
+            except TypeError:
+                return False
+
+    def get_loc(self, key):  # NaN-aware label lookup
+        if is_float(key) and np.isnan(key):  # np.isnan rejects non-numeric keys
+            nan_idxs = self._nan_idxs
+            if nan_idxs.size:  # one NaN -> scalar position, several -> array
+                return nan_idxs.item() if nan_idxs.size == 1 else nan_idxs
+            raise KeyError(key)  # NaN requested but the index holds none
+        return super(Float64Index, self).get_loc(key)
+
+    @property
+    def is_all_dates(self):
+        """
+        Always False: the labels are stored as float64, never as datetimes
+        """
+        return False
+
+    @cache_readonly
+    def _nan_idxs(self):  # integer positions of the NaN labels
+        w, = self._isnan.nonzero()  # unpack the single index array
+        return w
+
+    @cache_readonly
+    def _isnan(self):  # boolean mask, True where a label is NaN
+        return np.isnan(self.values)
+
+    @cache_readonly
+    def _hasnans(self):  # True when at least one label is NaN
+        return self._isnan.any()
+
+    @cache_readonly
+    def is_unique(self):  # parent's uniqueness check plus "at most one NaN label"
+        return super(Float64Index, self).is_unique and self._nan_idxs.size < 2
+
+    def isin(self, values):
+        """
+        Compute boolean array of whether each index value is found in the
+        passed set of values (NaN labels match only if ``values`` has a NaN)
+
+        Parameters
+        ----------
+        values : set or sequence of values
+
+        Returns
+        -------
+        is_contained : ndarray (boolean dtype)
+        """
+        value_set = set(values)
+        return lib.ismember_nans(self._array_values(), value_set,
+                                 isnull(list(value_set)).any())
 
 class MultiIndex(Index):
 
diff --git a/pandas/hashtable.pxd b/pandas/hashtable.pxd
@@ -1,4 +1,4 @@
-from khash cimport *
+from khash cimport kh_int64_t, kh_float64_t, kh_pymap_t, int64_t, float64_t
 
 # prototypes for sharing
 
@@ -11,12 +11,11 @@ cdef class Int64HashTable(HashTable):
     cpdef get_item(self, int64_t val)
     cpdef set_item(self, int64_t key, Py_ssize_t val)
 
-
 cdef class Float64HashTable(HashTable):
     cdef kh_float64_t *table
 
-    # cpdef get_item(self, float64_t val)
-    # cpdef set_item(self, float64_t key, Py_ssize_t val)
+    cpdef get_item(self, float64_t val)
+    cpdef set_item(self, float64_t key, Py_ssize_t val)
 
 cdef class PyObjectHashTable(HashTable):
     cdef kh_pymap_t *table
diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx
@@ -145,10 +145,6 @@ cdef class HashTable:
 cdef class StringHashTable(HashTable):
     cdef kh_str_t *table
 
-    # def __init__(self, size_hint=1):
-    #     if size_hint is not None:
-    #         kh_resize_str(self.table, size_hint)
-
     def __cinit__(self, int size_hint=1):
         self.table = kh_init_str()
         if size_hint is not None:
@@ -539,8 +535,6 @@ cdef class Int64HashTable: #(HashTable):
 
 
 cdef class Float64HashTable(HashTable):
-    # cdef kh_float64_t *table
-
     def __cinit__(self, size_hint=1):
         self.table = kh_init_float64()
         if size_hint is not None:
@@ -549,9 +543,34 @@ cdef class Float64HashTable(HashTable):
     def __len__(self):
         return self.table.size
 
+    cpdef get_item(self, float64_t val):  # look up the value stored for ``val``
+        cdef khiter_t k
+        k = kh_get_float64(self.table, val)
+        if k != self.table.n_buckets:  # k == n_buckets is handled as a miss
+            return self.table.vals[k]
+        else:
+            raise KeyError(val)
+
+    cpdef set_item(self, float64_t key, Py_ssize_t val):  # store ``val`` under ``key``
+        cdef:
+            khiter_t k
+            int ret = 0
+
+        k = kh_put_float64(self.table, key, &ret)  # ``ret`` is not inspected below
+        self.table.keys[k] = key
+        if kh_exist_float64(self.table, k):
+            self.table.vals[k] = val
+        else:
+            raise KeyError(key)  # slot ``k`` is not reported as existing after the put
+
     def __dealloc__(self):
         kh_destroy_float64(self.table)
 
+    def __contains__(self, object key):  # membership test on the float64 table
+        cdef khiter_t k
+        k = kh_get_float64(self.table, key)
+        return k != self.table.n_buckets  # k == n_buckets is handled as a miss
+
     def factorize(self, ndarray[float64_t] values):
         uniques = Float64Vector()
         labels = self.get_labels(values, uniques, 0, -1)
diff --git a/pandas/index.pyx b/pandas/index.pyx
@@ -398,6 +398,9 @@ cdef class Float64Engine(IndexEngine):
     cdef _make_hash_table(self, n):
         return _hash.Float64HashTable(n)
 
+    cdef _get_index_values(self):
+        return algos.ensure_float64(self.vgetter())
+
     def _call_monotonic(self, values):
         return algos.is_monotonic_float64(values)
 
diff --git a/pandas/lib.pyx b/pandas/lib.pyx
@@ -92,6 +92,22 @@ cpdef map_indices_list(list index):
 
 from libc.stdlib cimport malloc, free
 
+
+def ismember_nans(float64_t[:] arr, set values, bint hasnans):  # NaN-aware ``ismember``
+    cdef:
+        Py_ssize_t i, n
+        ndarray[uint8_t] result
+        float64_t val
+
+    n = len(arr)
+    result = np.empty(n, dtype=np.uint8)  # filled as 0/1, viewed as bool on return
+    for i in range(n):
+        val = arr[i]
+        result[i] = val in values or hasnans and isnan(val)  # NaN also counts when ``hasnans``
+
+    return result.view(np.bool_)
+
+
 def ismember(ndarray arr, set values):
     '''
     Checks whether
@@ -114,10 +130,7 @@ def ismember(ndarray arr, set values):
     result = np.empty(n, dtype=np.uint8)
     for i in range(n):
         val = util.get_value_at(arr, i)
-        if val in values:
-            result[i] = 1
-        else:
-            result[i] = 0
+        result[i] = val in values
 
     return result.view(np.bool_)
 
diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py
@@ -37,6 +37,7 @@ def _skip_if_need_numpy_1_7():
     if _np_version_under1p7:
         raise nose.SkipTest('numpy >= 1.7 required')
 
+
 class TestIndex(tm.TestCase):
     _multiprocess_can_split_ = True
 
@@ -835,15 +836,15 @@ def test_constructor(self):
         self.assertIsInstance(index, Float64Index)
         index = Float64Index(np.array([1.,2,3,4,5]))
         self.assertIsInstance(index, Float64Index)
-        self.assertEqual(index.dtype, object)
+        self.assertEqual(index.dtype, float)
 
         index = Float64Index(np.array([1.,2,3,4,5]),dtype=np.float32)
         self.assertIsInstance(index, Float64Index)
-        self.assertEqual(index.dtype, object)
+        self.assertEqual(index.dtype, np.float64)
 
         index = Float64Index(np.array([1,2,3,4,5]),dtype=np.float32)
         self.assertIsInstance(index, Float64Index)
-        self.assertEqual(index.dtype, object)
+        self.assertEqual(index.dtype, np.float64)
 
         # nan handling
         result = Float64Index([np.nan, np.nan])
@@ -904,6 +905,15 @@ def test_equals(self):
         i2 = Float64Index([1.0,np.nan])
         self.assertTrue(i.equals(i2))
 
+    def test_contains_nans(self):  # ``in`` must find NaN in a Float64Index
+        i = Float64Index([1.0, 2.0, np.nan])
+        self.assertTrue(np.nan in i)
+
+    def test_contains_not_nans(self):  # ordinary floats are still found when NaN is present
+        i = Float64Index([1.0, 2.0, np.nan])
+        self.assertTrue(1.0 in i)
+
+
 class TestInt64Index(tm.TestCase):
     _multiprocess_can_split_ = True
 
diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py
diff --git a/vb_suite/index_object.py b/vb_suite/index_object.py