Skip to content

ENH: Float64Index now uses Float64Hashtable as a backend #6879

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 14, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions doc/source/indexing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1261,6 +1261,15 @@ numpy array. For instance,
Float64Index
------------

.. note::

As of 0.14.0, ``Float64Index`` is backed by a native ``float64`` dtype
array. Prior to 0.14.0, ``Float64Index`` was backed by an ``object`` dtype
array. Using a ``float64`` dtype in the backend speeds up arithmetic
operations by about 30x and boolean indexing operations on the
``Float64Index`` itself are about 2x as fast.


.. versionadded:: 0.13.0

By default a ``Float64Index`` will be automatically created when passing floating, or mixed-integer-floating values in index creation.
Expand Down
2 changes: 2 additions & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,8 @@ Improvements to existing features
- Performance improvement for ``DataFrame.from_records`` when reading a
specified number of rows from an iterable (:issue:`6700`)
- :ref:`Holidays and holiday calendars<timeseries.holiday>` are now available and can be used with CustomBusinessDay (:issue:`6719`)
- ``Float64Index`` is now backed by a ``float64`` dtype ndarray instead of an
``object`` dtype array (:issue:`6471`).
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add the same note in the v0.14 API section.


.. _release.bug_fixes-0.14.0:

Expand Down
2 changes: 2 additions & 0 deletions doc/source/v0.14.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -468,6 +468,8 @@ Enhancements
file. (:issue:`6545`)
- ``pandas.io.gbq`` now handles reading unicode strings properly. (:issue:`5940`)
- :ref:`Holidays Calendars<timeseries.holiday>` are now available and can be used with CustomBusinessDay (:issue:`6719`)
- ``Float64Index`` is now backed by a ``float64`` dtype ndarray instead of an
``object`` dtype array (:issue:`6471`).

Performance
~~~~~~~~~~~
Expand Down
7 changes: 3 additions & 4 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1843,7 +1843,6 @@ def eval(self, expr, **kwargs):
kwargs['resolvers'] = kwargs.get('resolvers', ()) + resolvers
return _eval(expr, **kwargs)


def _box_item_values(self, key, values):
items = self.columns[self.columns.get_loc(key)]
if values.ndim == 2:
Expand Down Expand Up @@ -2566,7 +2565,7 @@ def sort_index(self, axis=0, by=None, ascending=True, inplace=False,
-------
sorted : DataFrame
"""

from pandas.core.groupby import _lexsort_indexer, _nargsort
axis = self._get_axis_number(axis)
if axis not in [0, 1]: # pragma: no cover
Expand Down Expand Up @@ -2622,7 +2621,7 @@ def trans(v):
else:
indexer = _nargsort(labels, kind=kind, ascending=ascending,
na_position=na_position)

if inplace:
if axis == 1:
new_data = self._data.reindex_items(
Expand Down Expand Up @@ -3285,7 +3284,7 @@ def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None,
code path. This can lead to unexpected behavior if func has
side-effects, as they will take effect twice for the first
column/row.

Examples
--------
>>> df.apply(numpy.sqrt) # returns DataFrame
Expand Down
103 changes: 88 additions & 15 deletions pandas/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@
from pandas.util.decorators import cache_readonly, deprecate
from pandas.core.common import isnull, array_equivalent
import pandas.core.common as com
from pandas.core.common import _values_from_object, is_float, is_integer, ABCSeries
from pandas.core.common import (_values_from_object, is_float, is_integer,
ABCSeries)
from pandas.core.config import get_option

# simplify
Expand All @@ -27,6 +28,13 @@
__all__ = ['Index']


def _try_get_item(x):
try:
return x.item()
except AttributeError:
return x


def _indexOp(opname):
"""
Wrapper function for index comparison operations, to avoid
Expand Down Expand Up @@ -1911,11 +1919,17 @@ class Float64Index(Index):

Notes
-----
An Index instance can **only** contain hashable objects
A Float64Index instance can **only** contain hashable objects
"""

# when this is not longer object dtype this can be changed
#_engine_type = _index.Float64Engine
_engine_type = _index.Float64Engine
_groupby = _algos.groupby_float64
_arrmap = _algos.arrmap_float64
_left_indexer_unique = _algos.left_join_indexer_unique_float64
_left_indexer = _algos.left_join_indexer_float64
_inner_indexer = _algos.inner_join_indexer_float64
_outer_indexer = _algos.outer_join_indexer_float64

def __new__(cls, data, dtype=None, copy=False, name=None, fastpath=False):

Expand All @@ -1938,9 +1952,9 @@ def __new__(cls, data, dtype=None, copy=False, name=None, fastpath=False):
raise TypeError('Unsafe NumPy casting, you must '
'explicitly cast')

# coerce to object for storage
if not subarr.dtype == np.object_:
subarr = subarr.astype(object)
# coerce to float64 for storage
if subarr.dtype != np.float64:
subarr = subarr.astype(np.float64)

subarr = subarr.view(cls)
subarr.name = name
Expand All @@ -1951,13 +1965,12 @@ def inferred_type(self):
return 'floating'

def astype(self, dtype):
if np.dtype(dtype) != np.object_:
raise TypeError('Setting %s dtype to anything other than object '
'is not supported' % self.__class__)
return Index(self.values, name=self.name, dtype=object)
if np.dtype(dtype) not in (np.object, np.float64):
raise TypeError('Setting %s dtype to anything other than '
'float64 or object is not supported' % self.__class__)
return Index(self.values, name=self.name, dtype=dtype)

def _convert_scalar_indexer(self, key, typ=None):

if typ == 'iloc':
return super(Float64Index, self)._convert_scalar_indexer(key,
typ=typ)
Expand All @@ -1968,8 +1981,6 @@ def _convert_slice_indexer(self, key, typ=None):
unless we are iloc """
if typ == 'iloc':
return self._convert_slice_indexer_iloc(key)
elif typ == 'getitem':
pass

# allow floats here
self._validate_slicer(
Expand Down Expand Up @@ -2008,13 +2019,75 @@ def equals(self, other):
try:
if not isinstance(other, Float64Index):
other = self._constructor(other)
if self.dtype != other.dtype or self.shape != other.shape: return False
if self.dtype != other.dtype or self.shape != other.shape:
return False
left, right = self.values, other.values
return ((left == right) | (isnull(left) & isnull(right))).all()
return ((left == right) | (self._isnan & other._isnan)).all()
except TypeError:
# e.g. fails in numpy 1.6 with DatetimeIndex #1681
return False

def __contains__(self, other):
    """Membership test with explicit NaN support.

    ``np.nan != np.nan``, so the base hash-table lookup cannot find NaN;
    when the fast path fails, fall back to an isnan check against the
    cached ``_hasnans`` flag, then to unboxing a length-1 sequence.
    """
    if super(Float64Index, self).__contains__(other):
        return True

    try:
        # if other is a sequence this throws a ValueError
        return np.isnan(other) and self._hasnans
    except ValueError:
        try:
            # a length-<=1 sequence may wrap a single scalar; unbox and retry
            return len(other) <= 1 and _try_get_item(other) in self
        except TypeError:
            return False

def get_loc(self, key):
    """Return the integer location(s) of ``key`` in the index.

    NaN cannot be found via the hash-table engine (NaN != NaN), so NaN
    keys are resolved through the cached array of NaN positions.
    NOTE(review): ``np.isnan(key)`` raises TypeError for non-numeric
    keys (e.g. strings) — presumably callers pass numeric keys; verify.
    """
    if np.isnan(key):
        try:
            # exactly one NaN: return its position as a plain scalar
            return self._nan_idxs.item()
        except ValueError:
            # zero or multiple NaNs: return the positions array as-is
            return self._nan_idxs
    return super(Float64Index, self).get_loc(key)

@property
def is_all_dates(self):
    """Always False — a Float64Index holds floats, never datetime labels."""
    return False

@cache_readonly
def _nan_idxs(self):
    # Integer positions of NaN values; computed once and cached since
    # the index is immutable.
    w, = self._isnan.nonzero()
    return w

@cache_readonly
def _isnan(self):
    # Boolean mask of NaN entries over the backing float64 values; cached.
    return np.isnan(self.values)

@cache_readonly
def _hasnans(self):
    # True if the index contains at least one NaN; cached.
    return self._isnan.any()

@cache_readonly
def is_unique(self):
    # NaN != NaN defeats the base duplicate detection, so additionally
    # require at most one NaN position for the index to count as unique.
    return super(Float64Index, self).is_unique and self._nan_idxs.size < 2

def isin(self, values):
    """
    Compute boolean array of whether each index value is found in the
    passed set of values

    Parameters
    ----------
    values : set or sequence of values

    Returns
    -------
    is_contained : ndarray (boolean dtype)

    Notes
    -----
    NaN entries are handled explicitly via the cached ``_hasnans`` flag,
    since set membership cannot reliably match NaN values.
    """
    value_set = set(values)
    return lib.ismember_nans(self._array_values(), value_set,
                             self._hasnans)

class MultiIndex(Index):

Expand Down
7 changes: 3 additions & 4 deletions pandas/hashtable.pxd
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from khash cimport *
from khash cimport kh_int64_t, kh_float64_t, kh_pymap_t, int64_t, float64_t

# prototypes for sharing

Expand All @@ -11,12 +11,11 @@ cdef class Int64HashTable(HashTable):
cpdef get_item(self, int64_t val)
cpdef set_item(self, int64_t key, Py_ssize_t val)


cdef class Float64HashTable(HashTable):
cdef kh_float64_t *table

# cpdef get_item(self, float64_t val)
# cpdef set_item(self, float64_t key, Py_ssize_t val)
cpdef get_item(self, float64_t val)
cpdef set_item(self, float64_t key, Py_ssize_t val)

cdef class PyObjectHashTable(HashTable):
cdef kh_pymap_t *table
Expand Down
31 changes: 25 additions & 6 deletions pandas/hashtable.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -145,10 +145,6 @@ cdef class HashTable:
cdef class StringHashTable(HashTable):
cdef kh_str_t *table

# def __init__(self, size_hint=1):
# if size_hint is not None:
# kh_resize_str(self.table, size_hint)

def __cinit__(self, int size_hint=1):
self.table = kh_init_str()
if size_hint is not None:
Expand Down Expand Up @@ -539,8 +535,6 @@ cdef class Int64HashTable: #(HashTable):


cdef class Float64HashTable(HashTable):
# cdef kh_float64_t *table

def __cinit__(self, size_hint=1):
self.table = kh_init_float64()
if size_hint is not None:
Expand All @@ -549,9 +543,34 @@ cdef class Float64HashTable(HashTable):
def __len__(self):
return self.table.size

cpdef get_item(self, float64_t val):
    # Return the stored value for ``val``; raise KeyError if absent.
    cdef khiter_t k
    k = kh_get_float64(self.table, val)
    if k != self.table.n_buckets:
        return self.table.vals[k]
    else:
        # n_buckets is khash's sentinel for "not found"
        raise KeyError(val)

cpdef set_item(self, float64_t key, Py_ssize_t val):
    # Insert or overwrite the mapping key -> val in the khash table.
    cdef:
        khiter_t k
        int ret = 0

    k = kh_put_float64(self.table, key, &ret)
    self.table.keys[k] = key
    if kh_exist_float64(self.table, k):
        self.table.vals[k] = val
    else:
        # put failed to create a live bucket — surface as KeyError
        raise KeyError(key)

def __dealloc__(self):
    # Free the underlying khash table when the object is collected.
    kh_destroy_float64(self.table)

def __contains__(self, object key):
    # True if ``key`` is present in the hash table.
    cdef khiter_t k
    k = kh_get_float64(self.table, key)
    return k != self.table.n_buckets

def factorize(self, ndarray[float64_t] values):
uniques = Float64Vector()
labels = self.get_labels(values, uniques, 0, -1)
Expand Down
3 changes: 3 additions & 0 deletions pandas/index.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -398,6 +398,9 @@ cdef class Float64Engine(IndexEngine):
cdef _make_hash_table(self, n):
    # Back this engine with a float64-keyed hash table of size hint ``n``.
    return _hash.Float64HashTable(n)

cdef _get_index_values(self):
    # Coerce the underlying index values to a float64 ndarray.
    return algos.ensure_float64(self.vgetter())

def _call_monotonic(self, values):
    # Delegate monotonicity checking to the float64 specialization.
    return algos.is_monotonic_float64(values)

Expand Down
21 changes: 17 additions & 4 deletions pandas/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,22 @@ cpdef map_indices_list(list index):

from libc.stdlib cimport malloc, free


def ismember_nans(float64_t[:] arr, set values, bint hasnans):
    # Like ``ismember`` but, when ``hasnans`` is set, NaN entries are also
    # counted as members (set membership cannot match NaN since NaN != NaN).
    cdef:
        Py_ssize_t i, n
        ndarray[uint8_t] result
        float64_t val

    n = len(arr)
    result = np.empty(n, dtype=np.uint8)
    for i in range(n):
        val = arr[i]
        # precedence: ``val in values or (hasnans and isnan(val))``
        result[i] = val in values or hasnans and isnan(val)

    return result.view(np.bool_)


def ismember(ndarray arr, set values):
'''
Checks whether
Expand All @@ -114,10 +130,7 @@ def ismember(ndarray arr, set values):
result = np.empty(n, dtype=np.uint8)
for i in range(n):
val = util.get_value_at(arr, i)
if val in values:
result[i] = 1
else:
result[i] = 0
result[i] = val in values

return result.view(np.bool_)

Expand Down
16 changes: 13 additions & 3 deletions pandas/tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ def _skip_if_need_numpy_1_7():
if _np_version_under1p7:
raise nose.SkipTest('numpy >= 1.7 required')


class TestIndex(tm.TestCase):
_multiprocess_can_split_ = True

Expand Down Expand Up @@ -835,15 +836,15 @@ def test_constructor(self):
self.assertIsInstance(index, Float64Index)
index = Float64Index(np.array([1.,2,3,4,5]))
self.assertIsInstance(index, Float64Index)
self.assertEqual(index.dtype, object)
self.assertEqual(index.dtype, float)

index = Float64Index(np.array([1.,2,3,4,5]),dtype=np.float32)
self.assertIsInstance(index, Float64Index)
self.assertEqual(index.dtype, object)
self.assertEqual(index.dtype, np.float64)

index = Float64Index(np.array([1,2,3,4,5]),dtype=np.float32)
self.assertIsInstance(index, Float64Index)
self.assertEqual(index.dtype, object)
self.assertEqual(index.dtype, np.float64)

# nan handling
result = Float64Index([np.nan, np.nan])
Expand Down Expand Up @@ -904,6 +905,15 @@ def test_equals(self):
i2 = Float64Index([1.0,np.nan])
self.assertTrue(i.equals(i2))

def test_contains_nans(self):
    # NaN membership must work even though NaN != NaN
    i = Float64Index([1.0, 2.0, np.nan])
    self.assertTrue(np.nan in i)

def test_contains_not_nans(self):
    # ordinary float membership goes through the hash-table fast path
    i = Float64Index([1.0, 2.0, np.nan])
    self.assertTrue(1.0 in i)


class TestInt64Index(tm.TestCase):
_multiprocess_can_split_ = True

Expand Down
Loading