Skip to content

Commit d38b36e

Browse files
committed
ENH: deep internal tinkering, cythonized cache_readonly, engines cython ext
1 parent 03fb204 commit d38b36e

File tree

9 files changed

+175
-308
lines changed

9 files changed

+175
-308
lines changed

pandas/core/daterange.py

Lines changed: 9 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,8 @@ def f(self, other):
2222
_CACHE_START = datetime(1950, 1, 1)
2323
_CACHE_END = datetime(2030, 1, 1)
2424

25+
_daterange_cache = {}
26+
2527
class DateRange(Index):
2628
"""
2729
Fixed frequency date range according to input parameters.
@@ -43,7 +45,6 @@ class DateRange(Index):
4345
tzinfo : pytz.timezone
4446
To endow DateRange with time zone information
4547
"""
46-
_cache = {}
4748
def __new__(cls, start=None, end=None, periods=None,
4849
offset=datetools.bday, time_rule=None,
4950
tzinfo=None, name=None, **kwds):
@@ -143,17 +144,17 @@ def _cached_range(cls, start=None, end=None, periods=None, offset=None,
143144
if offset is None:
144145
raise Exception('Must provide a DateOffset!')
145146

146-
if offset not in cls._cache:
147+
if offset not in _daterange_cache:
147148
xdr = generate_range(_CACHE_START, _CACHE_END, offset=offset)
148149
arr = np.array(list(xdr), dtype=object, copy=False)
149150

150151
cachedRange = arr.view(DateRange)
151152
cachedRange.offset = offset
152153
cachedRange.tzinfo = None
153154
cachedRange.name = None
154-
cls._cache[offset] = cachedRange
155+
_daterange_cache[offset] = cachedRange
155156
else:
156-
cachedRange = cls._cache[offset]
157+
cachedRange = _daterange_cache[offset]
157158

158159
if start is None:
159160
if end is None:
@@ -165,13 +166,13 @@ def _cached_range(cls, start=None, end=None, periods=None, offset=None,
165166

166167
end = offset.rollback(end)
167168

168-
endLoc = cachedRange.indexMap[end] + 1
169+
endLoc = cachedRange.get_loc(end) + 1
169170
startLoc = endLoc - periods
170171
elif end is None:
171172
assert(isinstance(start, datetime))
172173
start = offset.rollforward(start)
173174

174-
startLoc = cachedRange.indexMap[start]
175+
startLoc = cachedRange.get_loc(start)
175176
if periods is None:
176177
raise Exception('Must provide number of periods!')
177178

@@ -180,8 +181,8 @@ def _cached_range(cls, start=None, end=None, periods=None, offset=None,
180181
start = offset.rollforward(start)
181182
end = offset.rollback(end)
182183

183-
startLoc = cachedRange.indexMap[start]
184-
endLoc = cachedRange.indexMap[end] + 1
184+
startLoc = cachedRange.get_loc(start)
185+
endLoc = cachedRange.get_loc(end) + 1
185186

186187
indexSlice = cachedRange[startLoc:endLoc]
187188
indexSlice.name = name

pandas/core/frame.py

Lines changed: 16 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -865,7 +865,22 @@ def __array_wrap__(self, result):
865865
columns=self.columns, copy=False)
866866

867867
#----------------------------------------------------------------------
868-
# getitem/setitem related
868+
# Getting and setting elements
869+
870+
def get(self, index, col):
    """
    Look up one cell of the frame from its row label and column label

    Parameters
    ----------
    index : row label
    col : column label

    Returns
    -------
    element : scalar value
    """
    # get_scalar is handed the column label first, then the row label
    key = (col, index)
    return self._data.get_scalar(key)
869884

870885
def __getitem__(self, key):
871886
# slice rows

pandas/core/index.py

Lines changed: 19 additions & 52 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@
99
_is_bool_indexer, _asarray_tuplesafe)
1010
from pandas.util.decorators import cache_readonly
1111
import pandas._tseries as lib
12+
import pandas._engines as _engines
1213

1314
__all__ = ['Index']
1415

@@ -93,19 +94,15 @@ def values(self):
9394
def is_monotonic(self):
9495
return self._is_monotonic(self.values)
9596

96-
_indexMap = None
97-
_integrity = False
98-
9997
@property
def indexMap(self):
    "{label -> location} mapping, obtained from the index engine"
    # NOTE(review): the meaning of the literal 1 passed to get_mapping is
    # not visible here -- confirm against _engines.DictIndexEngine
    engine = self._engine
    return engine.get_mapping(1)
105101

106-
if not self._integrity:
107-
raise Exception('Index cannot contain duplicate values!')
108-
return self._indexMap
102+
@cache_readonly
def _engine(self):
    """
    Lookup engine for this index, built from its values and its
    _map_indices attribute
    """
    # property, for now, slow to look up
    # NOTE(review): presumably computed once and then reused thanks to
    # cache_readonly -- confirm against pandas.util.decorators
    labels = self.values
    mapper = self._map_indices
    return _engines.DictIndexEngine(labels, mapper)
109106

110107
def _get_level_number(self, level):
111108
if not isinstance(level, int):
@@ -114,12 +111,7 @@ def _get_level_number(self, level):
114111
return level
115112

116113
def _verify_integrity(self):
    """
    Return the has_integrity attribute reported by the index engine
    """
    engine = self._engine
    return engine.has_integrity
123115

124116
def _get_duplicates(self):
125117
from collections import defaultdict
@@ -139,7 +131,7 @@ def is_all_dates(self):
139131
return self._allDates
140132

141133
def __iter__(self):
    """Iterate over the labels exposed by the values attribute"""
    labels = self.values
    return iter(labels)
143135

144136
def __setstate__(self, state):
145137
"""Necessary for making this object picklable"""
@@ -152,7 +144,7 @@ def __deepcopy__(self, memo={}):
152144
return self
153145

154146
def __contains__(self, key):
    """Membership test for a label, delegated to the index engine"""
    engine = self._engine
    return key in engine
156148

157149
def __hash__(self):
    """Hash of this index viewed as a plain np.ndarray"""
    as_ndarray = self.view(np.ndarray)
    return hash(as_ndarray)
@@ -238,7 +230,7 @@ def asof(self, label):
238230
For a sorted index, return the most recent label up to and including
239231
the passed label. Return NaN if not found
240232
"""
241-
if label not in self.indexMap:
233+
if label not in self:
242234
loc = self.searchsorted(label, side='left')
243235
if loc > 0:
244236
return self[loc-1]
@@ -409,7 +401,7 @@ def get_loc(self, key):
409401
-------
410402
loc : int
411403
"""
412-
return self.indexMap[key]
404+
return self._engine.get_loc(key)
413405

414406
def get_indexer(self, target, method=None):
415407
"""
@@ -570,14 +562,14 @@ def slice_locs(self, start=None, end=None):
570562
if start is None:
571563
beg_slice = 0
572564
elif start in self:
573-
beg_slice = self.indexMap[start]
565+
beg_slice = self.get_loc(start)
574566
else:
575567
beg_slice = self.searchsorted(start, side='left')
576568

577569
if end is None:
578570
end_slice = len(self)
579-
elif end in self.indexMap:
580-
end_slice = self.indexMap[end] + 1
571+
elif end in self:
572+
end_slice = self.get_loc(end) + 1
581573
else:
582574
end_slice = self.searchsorted(end, side='right')
583575

@@ -893,11 +885,6 @@ def __init__(self, levels, labels, sortorder=None, names=None,
893885
def dtype(self):
894886
return np.dtype('O')
895887

896-
def __iter__(self):
897-
values = [np.asarray(lev).take(lab)
898-
for lev, lab in zip(self.levels, self.labels)]
899-
return izip(*values)
900-
901888
def _get_level_number(self, level):
902889
if not isinstance(level, int):
903890
count = self.names.count(level)
@@ -912,9 +899,9 @@ def _get_level_number(self, level):
912899

913900
@property
def values(self):
    """
    Expand every level by its label positions and zip the per-level arrays
    together with lib.fast_zip
    """
    # NOTE(review): presumably yields an object array of tuples, like the
    # implementation this replaced -- confirm against lib.fast_zip
    level_values = []
    for lev, lab in zip(self.levels, self.labels):
        level_values.append(np.asarray(lev).take(lab))
    return lib.fast_zip(level_values)
918905

919906
def get_level_values(self, level):
920907
"""
@@ -935,8 +922,7 @@ def get_level_values(self, level):
935922

936923
def __contains__(self, key):
    """
    Membership test against indexMap; any error raised while probing the
    mapping is reported as "not contained"
    """
    try:
        found = key in self.indexMap
    except Exception:
        # best-effort lookup: e.g. an unhashable key counts as absent
        return False
    return found
942928

@@ -1033,18 +1019,6 @@ def from_tuples(cls, tuples, sortorder=None, names=None):
10331019
return MultiIndex.from_arrays(arrays, sortorder=sortorder,
10341020
names=names)
10351021

1036-
@property
1037-
def indexMap(self):
1038-
if self._indexMap is None:
1039-
zipped = zip(*self.labels)
1040-
self._indexMap = lib.map_indices_list(zipped)
1041-
self._integrity = len(self._indexMap) == len(self)
1042-
1043-
if not self._integrity:
1044-
raise Exception('Index cannot contain duplicate values!')
1045-
1046-
return self._indexMap
1047-
10481022
@property
def nlevels(self):
    """Number of entries in self.levels"""
    levels = self.levels
    return len(levels)
@@ -1399,7 +1373,7 @@ def get_loc(self, key):
13991373
"""
14001374
if isinstance(key, tuple):
14011375
if len(key) == self.nlevels:
1402-
return self._get_tuple_loc(key)
1376+
return self._engine.get_loc(key)
14031377
else:
14041378
result = slice(*self.slice_locs(key, key))
14051379
if result.start == result.stop:
@@ -1418,13 +1392,6 @@ def get_loc(self, key):
14181392
j = labels.searchsorted(loc, side='right')
14191393
return slice(i, j)
14201394

1421-
def _get_tuple_loc(self, tup):
1422-
indexer = self._get_label_key(tup)
1423-
try:
1424-
return self.indexMap[indexer]
1425-
except KeyError:
1426-
raise KeyError(str(tup))
1427-
14281395
def _get_label_key(self, tup):
    """
    Translate a tuple of level values into the tuple of their locations,
    looking each value up in the level at the same position
    """
    # NOTE(review): the callers visible in this diff were removed; check
    # whether this helper is still used elsewhere
    positions = []
    for level, value in zip(self.levels, tup):
        positions.append(level.get_loc(value))
    return tuple(positions)
14301397

pandas/core/internals.py

Lines changed: 26 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -315,8 +315,7 @@ def shape(self):
315315
return tuple(len(ax) for ax in self.axes)
316316

317317
def _verify_integrity(self):
318-
union_items = _union_block_items(self.blocks)
319-
318+
_union_block_items(self.blocks)
320319
mgr_shape = self.shape
321320
for block in self.blocks:
322321
assert(block.values.shape[1:] == mgr_shape[1:])
@@ -520,6 +519,19 @@ def get(self, item):
520519
_, block = self._find_block(item)
521520
return block.get(item)
522521

522+
def get_scalar(self, tup):
    """
    Retrieve the single value addressed by one label per axis

    Parameters
    ----------
    tup : tuple
        (item, label_1, ..., label_k): the item label first, followed by
        one label for each of the remaining axes in self.axes[1:]

    Returns
    -------
    element : scalar value

    Raises
    ------
    ValueError
        If tup does not hold exactly one label per axis. Without this
        check zip() would silently truncate: too few labels would hand
        back a sub-array, too many would be ignored.
    """
    naxes = len(self.axes)
    if len(tup) != naxes:
        raise ValueError('Expected %d labels, got %d' % (naxes, len(tup)))

    item = tup[0]
    _, blk = self._find_block(item)

    # this could obviously be seriously sped up in cython
    # The item is located within the owning block's items, not the
    # manager's, because the lookup indexes blk.values directly
    item_loc = (blk.items.get_loc(item),)
    other_locs = tuple(ax.get_loc(x)
                       for ax, x in zip(self.axes[1:], tup[1:]))
    return blk.values[item_loc + other_locs]
534+
523535
def delete(self, item):
524536
i, _ = self._find_block(item)
525537
loc = self.items.get_loc(item)
@@ -548,6 +560,9 @@ def set(self, item, value):
548560
# insert at end
549561
self.insert(len(self.items), item, value)
550562

563+
def set_scalar(self, tup, value):
    """
    Set single item (placeholder, not implemented yet)

    Parameters
    ----------
    tup : tuple
        Labels addressing the element, item label first
    value : scalar value

    Raises
    ------
    NotImplementedError
        Always. Raising keeps a caller from believing the value was stored,
        which a silent no-op would allow.
    """
    raise NotImplementedError('set_scalar is not implemented yet')
565+
551566
def insert(self, loc, item, value):
552567
if item in self.items:
553568
raise Exception('cannot insert %s, already exists' % item)
@@ -903,23 +918,15 @@ def _blocks_to_series_dict(blocks, index=None):
903918
return series_dict
904919

905920
def _interleaved_dtype(blocks):
906-
have_int = False
907-
have_bool = False
908-
have_object = False
909-
have_float = False
910-
911-
for block in blocks:
912-
if isinstance(block, FloatBlock):
913-
have_float = True
914-
elif isinstance(block, IntBlock):
915-
have_int = True
916-
elif isinstance(block, BoolBlock):
917-
have_bool = True
918-
elif isinstance(block, ObjectBlock):
919-
have_object = True
920-
else: # pragma: no cover
921-
raise Exception('Unrecognized block type')
922-
921+
from collections import defaultdict
922+
counts = defaultdict(lambda: 0)
923+
for x in blocks:
924+
counts[type(x)] += 1
925+
926+
have_int = counts[IntBlock] > 0
927+
have_bool = counts[BoolBlock] > 0
928+
have_object = counts[ObjectBlock] > 0
929+
have_float = counts[FloatBlock] > 0
923930
have_numeric = have_float or have_int
924931

925932
if have_object:

0 commit comments

Comments
 (0)