Skip to content

Commit 7e394d0

Browse files
committed
ENH: Use Float64HashTable for Float64Index backend
1 parent f7205b5 commit 7e394d0

File tree

12 files changed

+227
-49
lines changed

12 files changed

+227
-49
lines changed

doc/source/indexing.rst

+9
Original file line numberDiff line numberDiff line change
@@ -1261,6 +1261,15 @@ numpy array. For instance,
12611261
Float64Index
12621262
------------
12631263

1264+
.. note::
1265+
1266+
As of 0.14.0, ``Float64Index`` is backed by a native ``float64`` dtype
1267+
array. Prior to 0.14.0, ``Float64Index`` was backed by an ``object`` dtype
1268+
array. Using a ``float64`` dtype in the backend makes arithmetic
1269+
operations about 30x faster and boolean indexing operations on the
1270+
``Float64Index`` itself about 2x faster.
1271+
1272+
12641273
.. versionadded:: 0.13.0
12651274

12661275
By default, a ``Float64Index`` is created automatically when floating-point or mixed integer/floating-point values are passed during index creation.

doc/source/release.rst

+2
Original file line numberDiff line numberDiff line change
@@ -258,6 +258,8 @@ Improvements to existing features
258258
- Performance improvement for ``DataFrame.from_records`` when reading a
259259
specified number of rows from an iterable (:issue:`6700`)
260260
- :ref:`Holidays and holiday calendars<timeseries.holiday>` are now available and can be used with CustomBusinessDay (:issue:`6719`)
261+
- ``Float64Index`` is now backed by a ``float64`` dtype ndarray instead of an
262+
``object`` dtype array (:issue:`6471`).
261263

262264
.. _release.bug_fixes-0.14.0:
263265

doc/source/v0.14.0.txt

+2
Original file line numberDiff line numberDiff line change
@@ -468,6 +468,8 @@ Enhancements
468468
file. (:issue:`6545`)
469469
- ``pandas.io.gbq`` now handles reading unicode strings properly. (:issue:`5940`)
470470
- :ref:`Holiday Calendars<timeseries.holiday>` are now available and can be used with CustomBusinessDay (:issue:`6719`)
471+
- ``Float64Index`` is now backed by a ``float64`` dtype ndarray instead of an
472+
``object`` dtype array (:issue:`6471`).
471473

472474
Performance
473475
~~~~~~~~~~~

pandas/core/frame.py

+3-4
Original file line numberDiff line numberDiff line change
@@ -1843,7 +1843,6 @@ def eval(self, expr, **kwargs):
18431843
kwargs['resolvers'] = kwargs.get('resolvers', ()) + resolvers
18441844
return _eval(expr, **kwargs)
18451845

1846-
18471846
def _box_item_values(self, key, values):
18481847
items = self.columns[self.columns.get_loc(key)]
18491848
if values.ndim == 2:
@@ -2566,7 +2565,7 @@ def sort_index(self, axis=0, by=None, ascending=True, inplace=False,
25662565
-------
25672566
sorted : DataFrame
25682567
"""
2569-
2568+
25702569
from pandas.core.groupby import _lexsort_indexer, _nargsort
25712570
axis = self._get_axis_number(axis)
25722571
if axis not in [0, 1]: # pragma: no cover
@@ -2622,7 +2621,7 @@ def trans(v):
26222621
else:
26232622
indexer = _nargsort(labels, kind=kind, ascending=ascending,
26242623
na_position=na_position)
2625-
2624+
26262625
if inplace:
26272626
if axis == 1:
26282627
new_data = self._data.reindex_items(
@@ -3285,7 +3284,7 @@ def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None,
32853284
code path. This can lead to unexpected behavior if func has
32863285
side-effects, as they will take effect twice for the first
32873286
column/row.
3288-
3287+
32893288
Examples
32903289
--------
32913290
>>> df.apply(numpy.sqrt) # returns DataFrame

pandas/core/index.py

+88-15
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,8 @@
1616
from pandas.util.decorators import cache_readonly, deprecate
1717
from pandas.core.common import isnull, array_equivalent
1818
import pandas.core.common as com
19-
from pandas.core.common import _values_from_object, is_float, is_integer, ABCSeries
19+
from pandas.core.common import (_values_from_object, is_float, is_integer,
20+
ABCSeries)
2021
from pandas.core.config import get_option
2122

2223
# simplify
@@ -27,6 +28,13 @@
2728
__all__ = ['Index']
2829

2930

31+
def _try_get_item(x):
32+
try:
33+
return x.item()
34+
except AttributeError:
35+
return x
36+
37+
3038
def _indexOp(opname):
3139
"""
3240
Wrapper function for index comparison operations, to avoid
@@ -1911,11 +1919,17 @@ class Float64Index(Index):
19111919
19121920
Notes
19131921
-----
1914-
An Index instance can **only** contain hashable objects
1922+
A Float64Index instance can **only** contain hashable objects
19151923
"""
19161924

19171925
# values are stored as float64 (no longer object dtype), so the float64 engine and algos are used
1918-
#_engine_type = _index.Float64Engine
1926+
_engine_type = _index.Float64Engine
1927+
_groupby = _algos.groupby_float64
1928+
_arrmap = _algos.arrmap_float64
1929+
_left_indexer_unique = _algos.left_join_indexer_unique_float64
1930+
_left_indexer = _algos.left_join_indexer_float64
1931+
_inner_indexer = _algos.inner_join_indexer_float64
1932+
_outer_indexer = _algos.outer_join_indexer_float64
19191933

19201934
def __new__(cls, data, dtype=None, copy=False, name=None, fastpath=False):
19211935

@@ -1938,9 +1952,9 @@ def __new__(cls, data, dtype=None, copy=False, name=None, fastpath=False):
19381952
raise TypeError('Unsafe NumPy casting, you must '
19391953
'explicitly cast')
19401954

1941-
# coerce to object for storage
1942-
if not subarr.dtype == np.object_:
1943-
subarr = subarr.astype(object)
1955+
# coerce to float64 for storage
1956+
if subarr.dtype != np.float64:
1957+
subarr = subarr.astype(np.float64)
19441958

19451959
subarr = subarr.view(cls)
19461960
subarr.name = name
@@ -1951,13 +1965,12 @@ def inferred_type(self):
19511965
return 'floating'
19521966

19531967
def astype(self, dtype):
1954-
if np.dtype(dtype) != np.object_:
1955-
raise TypeError('Setting %s dtype to anything other than object '
1956-
'is not supported' % self.__class__)
1957-
return Index(self.values, name=self.name, dtype=object)
1968+
if np.dtype(dtype) not in (np.object, np.float64):
1969+
raise TypeError('Setting %s dtype to anything other than '
1970+
'float64 or object is not supported' % self.__class__)
1971+
return Index(self.values, name=self.name, dtype=dtype)
19581972

19591973
def _convert_scalar_indexer(self, key, typ=None):
1960-
19611974
if typ == 'iloc':
19621975
return super(Float64Index, self)._convert_scalar_indexer(key,
19631976
typ=typ)
@@ -1968,8 +1981,6 @@ def _convert_slice_indexer(self, key, typ=None):
19681981
unless we are iloc """
19691982
if typ == 'iloc':
19701983
return self._convert_slice_indexer_iloc(key)
1971-
elif typ == 'getitem':
1972-
pass
19731984

19741985
# allow floats here
19751986
self._validate_slicer(
@@ -2008,13 +2019,75 @@ def equals(self, other):
20082019
try:
20092020
if not isinstance(other, Float64Index):
20102021
other = self._constructor(other)
2011-
if self.dtype != other.dtype or self.shape != other.shape: return False
2022+
if self.dtype != other.dtype or self.shape != other.shape:
2023+
return False
20122024
left, right = self.values, other.values
2013-
return ((left == right) | (isnull(left) & isnull(right))).all()
2025+
return ((left == right) | (self._isnan & other._isnan)).all()
20142026
except TypeError:
20152027
# e.g. fails in numpy 1.6 with DatetimeIndex #1681
20162028
return False
20172029

2030+
def __contains__(self, other):
    """
    Return whether ``other`` is a label of this index.

    The base-class membership test is tried first. If it fails, ``other``
    counts as contained when it is NaN and the index holds at least one
    NaN. Keys that ``np.isnan`` cannot handle are reported as absent.

    Parameters
    ----------
    other : object
        Candidate label.

    Returns
    -------
    bool-like
    """
    if super(Float64Index, self).__contains__(other):
        return True

    try:
        # if other is a sequence this throws a ValueError
        return np.isnan(other) and self._hasnans
    except ValueError:
        try:
            # only a sequence of at most one element can still match
            return len(other) <= 1 and _try_get_item(other) in self
        except TypeError:
            return False
    except TypeError:
        # np.isnan rejects non-numeric input (e.g. strings); such a key
        # is simply not a member rather than an error
        return False
2042+
2043+
def get_loc(self, key):
    """
    Get the integer location(s) of ``key``.

    NaN keys are resolved against the positions of the NaN labels; every
    other key is delegated to the base-class lookup.

    Parameters
    ----------
    key : label

    Returns
    -------
    int or ndarray
        For a NaN key: the single position when exactly one NaN is
        present, otherwise the array of all NaN positions. For other keys:
        whatever the base-class ``get_loc`` returns.

    Raises
    ------
    KeyError
        If ``key`` is NaN and the index holds no NaN.
    """
    try:
        key_is_nan = np.isnan(key)
    except TypeError:
        # non-numeric keys (e.g. strings) cannot be NaN; defer to the
        # generic lookup instead of leaking np.isnan's TypeError
        key_is_nan = False

    if key_is_nan:
        nan_idxs = self._nan_idxs
        if not len(nan_idxs):
            # a missing label is a KeyError, not an empty result
            raise KeyError(key)
        try:
            return nan_idxs.item()
        except ValueError:
            # more than one NaN: item() refuses, return all positions
            return nan_idxs
    return super(Float64Index, self).get_loc(key)
2050+
2051+
@property
def is_all_dates(self):
    """
    Report whether every label is a datetime object.

    This implementation unconditionally answers ``False``.
    """
    return False
2057+
2058+
@cache_readonly
def _nan_idxs(self):
    """Integer positions at which the NaN mask (``self._isnan``) is set."""
    (positions,) = self._isnan.nonzero()
    return positions
2062+
2063+
@cache_readonly
def _isnan(self):
    """Element-wise ``np.isnan`` of the underlying values."""
    data = self.values
    return np.isnan(data)
2066+
2067+
@cache_readonly
def _hasnans(self):
    """Whether at least one entry of the NaN mask (``self._isnan``) is set."""
    nan_mask = self._isnan
    return nan_mask.any()
2070+
2071+
@cache_readonly
def is_unique(self):
    """
    Whether the labels are unique.

    On top of the base-class check, an index holding two or more NaN
    labels is never reported as unique.
    """
    base_unique = super(Float64Index, self).is_unique
    if not base_unique:
        return base_unique
    return self._nan_idxs.size < 2
2074+
2075+
def isin(self, values):
    """
    Compute boolean array of whether each index value is found in the
    passed set of values

    NaN labels are reported as found only when ``values`` itself contains
    a NaN.

    Parameters
    ----------
    values : set or sequence of values

    Returns
    -------
    is_contained : ndarray (boolean dtype)
    """
    value_set = set(values)
    # ismember_nans flags every NaN label as a member when its third
    # argument is true, so that argument must say whether the *requested
    # values* include a NaN -- not whether this index happens to hold one.
    values_have_nan = isnull(list(value_set)).any()
    return lib.ismember_nans(self._array_values(), value_set,
                             values_have_nan)
20182091

20192092
class MultiIndex(Index):
20202093

pandas/hashtable.pxd

+3-4
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from khash cimport *
1+
from khash cimport kh_int64_t, kh_float64_t, kh_pymap_t, int64_t, float64_t
22

33
# prototypes for sharing
44

@@ -11,12 +11,11 @@ cdef class Int64HashTable(HashTable):
1111
cpdef get_item(self, int64_t val)
1212
cpdef set_item(self, int64_t key, Py_ssize_t val)
1313

14-
1514
cdef class Float64HashTable(HashTable):
1615
cdef kh_float64_t *table
1716

18-
# cpdef get_item(self, float64_t val)
19-
# cpdef set_item(self, float64_t key, Py_ssize_t val)
17+
cpdef get_item(self, float64_t val)
18+
cpdef set_item(self, float64_t key, Py_ssize_t val)
2019

2120
cdef class PyObjectHashTable(HashTable):
2221
cdef kh_pymap_t *table

pandas/hashtable.pyx

+25-6
Original file line numberDiff line numberDiff line change
@@ -145,10 +145,6 @@ cdef class HashTable:
145145
cdef class StringHashTable(HashTable):
146146
cdef kh_str_t *table
147147

148-
# def __init__(self, size_hint=1):
149-
# if size_hint is not None:
150-
# kh_resize_str(self.table, size_hint)
151-
152148
def __cinit__(self, int size_hint=1):
153149
self.table = kh_init_str()
154150
if size_hint is not None:
@@ -539,8 +535,6 @@ cdef class Int64HashTable: #(HashTable):
539535

540536

541537
cdef class Float64HashTable(HashTable):
542-
# cdef kh_float64_t *table
543-
544538
def __cinit__(self, size_hint=1):
545539
self.table = kh_init_float64()
546540
if size_hint is not None:
@@ -549,9 +543,34 @@ cdef class Float64HashTable(HashTable):
549543
def __len__(self):
550544
return self.table.size
551545

546+
cpdef get_item(self, float64_t val):
    # Look up ``val`` and return the value stored for it; a lookup that
    # comes back equal to ``n_buckets`` means the key is absent.
    cdef khiter_t k
    k = kh_get_float64(self.table, val)
    if k == self.table.n_buckets:
        raise KeyError(val)
    return self.table.vals[k]
553+
554+
cpdef set_item(self, float64_t key, Py_ssize_t val):
    # Store ``val`` under ``key``; raises KeyError(key) if the slot returned
    # by the put operation is not reported as existing.
    cdef:
        int ret = 0
        khiter_t k

    k = kh_put_float64(self.table, key, &ret)
    self.table.keys[k] = key
    if not kh_exist_float64(self.table, k):
        raise KeyError(key)
    self.table.vals[k] = val
565+
552566
def __dealloc__(self):
553567
kh_destroy_float64(self.table)
554568

569+
def __contains__(self, object key):
    # Membership test: a lookup result equal to ``n_buckets`` means the
    # key is not in the table.
    cdef khiter_t k
    k = kh_get_float64(self.table, key)
    return not (k == self.table.n_buckets)
573+
555574
def factorize(self, ndarray[float64_t] values):
556575
uniques = Float64Vector()
557576
labels = self.get_labels(values, uniques, 0, -1)

pandas/index.pyx

+3
Original file line numberDiff line numberDiff line change
@@ -398,6 +398,9 @@ cdef class Float64Engine(IndexEngine):
398398
cdef _make_hash_table(self, n):
399399
return _hash.Float64HashTable(n)
400400

401+
cdef _get_index_values(self):
    # fetch the values through the engine's getter and pass them through
    # ensure_float64 before they are used
    values = self.vgetter()
    return algos.ensure_float64(values)
403+
401404
def _call_monotonic(self, values):
402405
return algos.is_monotonic_float64(values)
403406

pandas/lib.pyx

+17-4
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,22 @@ cpdef map_indices_list(list index):
9292

9393
from libc.stdlib cimport malloc, free
9494

95+
96+
def ismember_nans(float64_t[:] arr, set values, bint hasnans):
    '''
    Checks whether each element of ``arr`` is in ``values``; when
    ``hasnans`` is true, NaN elements are additionally reported as members.

    Parameters
    ----------
    arr : float64 memoryview
    values : set
    hasnans : bint

    Returns
    -------
    ismember : ndarray (boolean dtype)
    '''
    cdef:
        Py_ssize_t i, n
        ndarray[uint8_t] result
        float64_t val

    n = len(arr)
    result = np.empty(n, dtype=np.uint8)
    for i in range(n):
        val = arr[i]
        if val in values:
            result[i] = 1
        else:
            # not found in the set: only a NaN can still count, and only
            # when the caller asked for NaN matching
            result[i] = hasnans and isnan(val)

    return result.view(np.bool_)
109+
110+
95111
def ismember(ndarray arr, set values):
96112
'''
97113
Checks whether
@@ -114,10 +130,7 @@ def ismember(ndarray arr, set values):
114130
result = np.empty(n, dtype=np.uint8)
115131
for i in range(n):
116132
val = util.get_value_at(arr, i)
117-
if val in values:
118-
result[i] = 1
119-
else:
120-
result[i] = 0
133+
result[i] = val in values
121134

122135
return result.view(np.bool_)
123136

pandas/tests/test_index.py

+13-3
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ def _skip_if_need_numpy_1_7():
3737
if _np_version_under1p7:
3838
raise nose.SkipTest('numpy >= 1.7 required')
3939

40+
4041
class TestIndex(tm.TestCase):
4142
_multiprocess_can_split_ = True
4243

@@ -835,15 +836,15 @@ def test_constructor(self):
835836
self.assertIsInstance(index, Float64Index)
836837
index = Float64Index(np.array([1.,2,3,4,5]))
837838
self.assertIsInstance(index, Float64Index)
838-
self.assertEqual(index.dtype, object)
839+
self.assertEqual(index.dtype, float)
839840

840841
index = Float64Index(np.array([1.,2,3,4,5]),dtype=np.float32)
841842
self.assertIsInstance(index, Float64Index)
842-
self.assertEqual(index.dtype, object)
843+
self.assertEqual(index.dtype, np.float64)
843844

844845
index = Float64Index(np.array([1,2,3,4,5]),dtype=np.float32)
845846
self.assertIsInstance(index, Float64Index)
846-
self.assertEqual(index.dtype, object)
847+
self.assertEqual(index.dtype, np.float64)
847848

848849
# nan handling
849850
result = Float64Index([np.nan, np.nan])
@@ -904,6 +905,15 @@ def test_equals(self):
904905
i2 = Float64Index([1.0,np.nan])
905906
self.assertTrue(i.equals(i2))
906907

908+
def test_contains_nans(self):
    # NaN must be reported as a member of an index holding a NaN label
    idx = Float64Index([1.0, 2.0, np.nan])
    self.assertTrue(np.nan in idx)
911+
912+
def test_contains_not_nans(self):
    # an ordinary float label is still found when the index also holds NaN
    idx = Float64Index([1.0, 2.0, np.nan])
    self.assertTrue(1.0 in idx)
915+
916+
907917
class TestInt64Index(tm.TestCase):
908918
_multiprocess_can_split_ = True
909919

0 commit comments

Comments
 (0)