Skip to content

Commit 7200074

Browse files
committed
ENH: don't populate hash table in index engine if >= 1e6 elements, to save memory and improve speed. close #1160
1 parent de66b56 commit 7200074

File tree

3 files changed

+62
-22
lines changed

3 files changed

+62
-22
lines changed

pandas/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,4 +36,5 @@
3636

3737
from pandas.tools.merge import merge, concat
3838
from pandas.tools.pivot import pivot_table, crosstab
39+
from pandas.tools.plotting import scatter_matrix
3940
from pandas.tools.describe import value_range

pandas/src/engines.pyx

Lines changed: 47 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -44,18 +44,27 @@ def get_value_at(ndarray arr, object loc):
4444
def set_value_at(ndarray arr, object loc, object val):
4545
return util.set_value_at(arr, loc, val)
4646

47+
48+
# Don't populate hash tables in monotonic indexes with at least this many elements
49+
cdef int _SIZE_CUTOFF = 1000000
50+
51+
4752
cdef class IndexEngine:
4853

4954
cdef readonly:
5055
object index_weakref
5156
HashTable mapping
57+
bint over_size_threshold
5258

5359
cdef:
5460
bint unique, monotonic
5561
bint initialized, monotonic_check, unique_check
5662

5763
def __init__(self, index_weakref):
5864
self.index_weakref = index_weakref
65+
66+
self.over_size_threshold = len(index_weakref()) >= _SIZE_CUTOFF
67+
5968
self.initialized = 0
6069
self.monotonic_check = 0
6170

@@ -101,6 +110,15 @@ cdef class IndexEngine:
101110
if is_definitely_invalid_key(val):
102111
raise TypeError
103112

113+
if self.over_size_threshold and self.is_monotonic:
114+
if not self.is_unique:
115+
return self._get_loc_duplicates(val)
116+
values = self._get_index_values()
117+
loc = values.searchsorted(val, side='left')
118+
if util.get_value_at(values, loc) != val:
119+
raise KeyError(val)
120+
return loc
121+
104122
self._ensure_mapping_populated()
105123
if not self.unique:
106124
return self._get_loc_duplicates(val)
@@ -337,19 +355,17 @@ cdef class ObjectEngine(IndexEngine):
337355

338356
cdef class DatetimeEngine(Int64Engine):
339357

340-
# cdef Int64HashTable mapping
341-
342358
def __contains__(self, object val):
343-
self._ensure_mapping_populated()
344-
345-
if util.is_datetime64_object(val):
346-
return val.view('i8') in self.mapping
347-
348-
if PyDateTime_Check(val):
349-
key = np.datetime64(val)
350-
return key.view('i8') in self.mapping
359+
if self.over_size_threshold and self.is_monotonic:
360+
if not self.is_unique:
361+
return self._get_loc_duplicates(val)
362+
values = self._get_index_values()
363+
conv = _to_i8(val)
364+
loc = values.searchsorted(conv, side='left')
365+
return util.get_value_at(values, loc) == conv
351366

352-
return val in self.mapping
367+
self._ensure_mapping_populated()
368+
return _to_i8(val) in self.mapping
353369

354370
cdef _get_index_values(self):
355371
return self.index_weakref().values.view('i8')
@@ -363,13 +379,19 @@ cdef class DatetimeEngine(Int64Engine):
363379

364380
# Welcome to the spaghetti factory
365381

382+
if self.over_size_threshold and self.is_monotonic:
383+
if not self.is_unique:
384+
return self._get_loc_duplicates(val)
385+
values = self._get_index_values()
386+
conv = _to_i8(val)
387+
loc = values.searchsorted(conv, side='left')
388+
if util.get_value_at(values, loc) != conv:
389+
raise KeyError(val)
390+
return loc
391+
366392
self._ensure_mapping_populated()
367393
if not self.unique:
368-
if util.is_datetime64_object(val):
369-
val = val.view('i8')
370-
elif PyDateTime_Check(val):
371-
val = np.datetime64(val)
372-
val = val.view('i8')
394+
val = _to_i8(val)
373395
return self._get_loc_duplicates(val)
374396

375397
try:
@@ -380,11 +402,7 @@ cdef class DatetimeEngine(Int64Engine):
380402
pass
381403

382404
try:
383-
if util.is_datetime64_object(val):
384-
val = val.view('i8')
385-
elif PyDateTime_Check(val):
386-
val = np.datetime64(val)
387-
val = val.view('i8')
405+
val = _to_i8(val)
388406
return self.mapping.get_item(val)
389407
except TypeError:
390408
self._date_check_type(val)
@@ -417,6 +435,14 @@ cdef class DatetimeEngine(Int64Engine):
417435
limit=limit)
418436

419437

438+
cdef inline _to_i8(object val):
439+
if util.is_datetime64_object(val):
440+
val = unbox_datetime64_scalar(val)
441+
elif PyDateTime_Check(val):
442+
val = np.datetime64(val)
443+
val = unbox_datetime64_scalar(val)
444+
return val
445+
420446
# ctypedef fused idxvalue_t:
421447
# object
422448
# int

vb_suite/timeseries.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,24 @@
99
rng = date_range('1/1/2000', periods=N, freq='min')
1010
except NameError:
1111
rng = DateRange('1/1/2000', periods=N, offset=datetools.Minute())
12-
date_range = DateRange
12+
def date_range(start=None, end=None, periods=None, freq=None):
13+
return DateRange(start, end, periods=periods, offset=freq)
1314
1415
ts = Series(np.random.randn(N), index=rng)
1516
"""
1617

18+
#----------------------------------------------------------------------
19+
# Lookup value in large time series, hash map population
20+
21+
setup = common_setup + """
22+
rng = date_range('1/1/2000', periods=1500000, freq='s')
23+
ts = Series(1, index=rng)
24+
"""
25+
26+
stmt = "ts[ts.index[len(ts) // 2]]; ts.index._cleanup()"
27+
timeseries_large_lookup_value = Benchmark(stmt, setup,
28+
start_date=datetime(2012, 1, 1))
29+
1730
#----------------------------------------------------------------------
1831
# Test slice minutely series
1932

0 commit comments

Comments
 (0)