Skip to content

Commit 3633d26

Browse files
committed
ENH: to_panel refactor and retool HDFStore table reader to not have to create MultiIndex, fixes perf regression
1 parent a6e1bd4 commit 3633d26

File tree

5 files changed

+97
-52
lines changed

5 files changed

+97
-52
lines changed

pandas/core/frame.py

Lines changed: 17 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -806,54 +806,30 @@ def to_panel(self):
806806
panel : Panel
807807
"""
808808
from pandas.core.panel import Panel
809-
810-
wide_shape = (len(self.columns), len(self.index.levels[0]),
811-
len(self.index.levels[1]))
809+
from pandas.core.reshape import block2d_to_block3d
812810

813811
# only support this kind for now
814812
assert(isinstance(self.index, MultiIndex) and
815813
len(self.index.levels) == 2)
816814

815+
self._consolidate_inplace()
816+
817817
major_axis, minor_axis = self.index.levels
818+
major_labels, minor_labels = self.index.labels
818819

819-
def make_mask(index):
820-
"""
821-
Create observation selection vector using major and minor
822-
labels, for converting to wide format.
823-
"""
824-
N, K = index.levshape
825-
selector = index.labels[1] + K * index.labels[0]
826-
mask = np.zeros(N * K, dtype=bool)
827-
mask.put(selector, True)
828-
return mask
829-
830-
def _to_wide_homogeneous():
831-
values = np.empty(wide_shape, dtype=self.values.dtype)
832-
if not issubclass(values.dtype.type, np.integer):
833-
values.fill(np.nan)
834-
835-
frame_values = self.values
836-
for i in xrange(len(self.columns)):
837-
values[i].flat[mask] = frame_values[:, i]
838-
return Panel(values, self.columns, major_axis, minor_axis)
839-
840-
def _to_wide_mixed():
841-
_, N, K = wide_shape
842-
# TODO: make much more efficient
843-
data = {}
844-
for item in self.columns:
845-
item_vals = self[item].values
846-
values = np.empty((N, K), dtype=item_vals.dtype)
847-
values.flat[mask] = item_vals
848-
data[item] = DataFrame(values, index=major_axis,
849-
columns=minor_axis)
850-
return Panel(data, self.columns, major_axis, minor_axis)
851-
852-
mask = make_mask(self.index)
853-
if self._is_mixed_type:
854-
return _to_wide_mixed()
855-
else:
856-
return _to_wide_homogeneous()
820+
shape = len(major_axis), len(minor_axis)
821+
822+
new_blocks = []
823+
for block in self._data.blocks:
824+
newb = block2d_to_block3d(block.values.T, block.items, shape,
825+
major_labels, minor_labels,
826+
ref_items=self.columns)
827+
new_blocks.append(newb)
828+
829+
new_axes = [self.columns, major_axis, minor_axis]
830+
new_mgr = BlockManager(new_blocks, new_axes)
831+
832+
return Panel(new_mgr)
857833

858834
to_wide = deprecate('to_wide', to_panel)
859835

pandas/core/index.py

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -998,11 +998,26 @@ def __getitem__(self, key):
998998
return np.ndarray.__getitem__(self, key)
999999

10001000
def unique_with_labels(values):
1001-
uniques = Index(lib.fast_unique(values))
1002-
labels = lib.get_unique_labels(values, uniques.indexMap)
1003-
uniques._cleanup()
1001+
rizer = lib.Factorizer(len(values))
1002+
labels, _ = rizer.factorize(values, sort=False)
1003+
uniques = Index(rizer.uniques)
1004+
1005+
sorter = uniques.argsort()
1006+
reverse_indexer = np.empty(len(sorter), dtype='i4')
1007+
reverse_indexer.put(sorter, np.arange(len(sorter)))
1008+
labels = reverse_indexer.take(labels)
1009+
uniques = uniques.take(sorter)
1010+
10041011
return uniques, labels
10051012

1013+
def unique_int64(values):
    """
    Return the unique entries of `values` as computed by an int64 hash table.

    Parameters
    ----------
    values : ndarray
        Cast to int64 ('i8') first when its dtype is not already int64

    Returns
    -------
    uniques : result of Int64HashTable.unique on the int64 data
    """
    # The hash table works on int64 data only; skip the cast when the
    # input already has that dtype.
    as_int64 = values if values.dtype == np.int64 else values.astype('i8')

    # Size the table by the number of input values.
    return lib.Int64HashTable(len(as_int64)).unique(as_int64)
1020+
10061021
class MultiIndex(Index):
10071022
"""
10081023
Implements multi-level, a.k.a. hierarchical, index object for pandas objects

pandas/core/reshape.py

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,9 @@
55

66
import numpy as np
77

8-
from pandas.core.frame import DataFrame
98
from pandas.core.series import Series
9+
from pandas.core.frame import DataFrame
10+
from pandas.core.panel import Panel
1011

1112
from pandas.core.common import notnull
1213
from pandas.core.groupby import get_group_index
@@ -565,3 +566,33 @@ def make_axis_dummies(frame, axis='minor', transform=None):
565566
values = values.take(labels, axis=0)
566567

567568
return DataFrame(values, columns=items, index=frame.index)
569+
570+
def block2d_to_block3d(values, items, shape, major_labels, minor_labels,
                       ref_items=None):
    """
    Developer method for pivoting DataFrame -> Panel. Used in HDFStore and
    DataFrame.to_panel

    Parameters
    ----------
    values : ndarray
        2-D data read as ``values[:, i]`` for the i-th item, i.e. one
        column per item
    items : sequence
        Item labels; ``len(items)`` is the first dimension of the result
    shape : tuple
        ``(n_major, n_minor)``, the trailing two dimensions of the result
    major_labels, minor_labels : integer ndarrays
        Position of each row of `values` along the major / minor axis
    ref_items : optional
        Passed through to ``make_block``; defaults to `items`

    Returns
    -------
    block : the result of ``make_block(pvalues, items, ref_items)``

    Notes
    -----
    Rows of `values` must already be ordered by ``(major, minor)`` label:
    the boolean mask is filled in ascending flat-position order, not in
    the order the rows are given. Nothing here checks that ordering.

    For integer dtypes the cells with no observation are NOT initialized
    (integers cannot hold NaN), so they contain whatever ``np.empty``
    returned.
    """
    from pandas.core.internals import make_block
    panel_shape = (len(items),) + shape

    # TODO: lexsort depth needs to be 2!! (see Notes: the labels must be
    # sorted by major, then minor, for the masked assignment to line up)

    # Create observation selection vector using major and minor
    # labels, for converting to panel format. Each (major, minor) pair maps
    # to its flat position in a C-ordered (n_major, n_minor) array.
    selector = minor_labels + shape[1] * major_labels
    mask = np.zeros(np.prod(shape), dtype=bool)
    mask.put(selector, True)

    pvalues = np.empty(panel_shape, dtype=values.dtype)
    if not issubclass(pvalues.dtype.type, np.integer):
        # Mark the cells with no observation as missing
        pvalues.fill(np.nan)

    # Each plane is a view on pvalues, so writing through .flat fills the
    # panel in place, one item at a time.
    for i, item_plane in enumerate(pvalues):
        item_plane.flat[mask] = values[:, i]

    if ref_items is None:
        ref_items = items

    return make_block(pvalues, items, ref_items)

pandas/io/pytables.py

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -624,7 +624,10 @@ def _read_frame_table(self, group, where=None):
624624
return self._read_panel_table(group, where)['value']
625625

626626
def _read_panel_table(self, group, where=None):
627+
from pandas.core.index import unique_int64, Factor
627628
from pandas.core.common import _asarray_tuplesafe
629+
from pandas.core.internals import BlockManager
630+
from pandas.core.reshape import block2d_to_block3d
628631

629632
table = getattr(group, 'table')
630633

@@ -637,19 +640,37 @@ def _read_panel_table(self, group, where=None):
637640
table._v_attrs.columns_kind)
638641
index = _maybe_convert(sel.values['index'],
639642
table._v_attrs.index_kind)
640-
# reconstruct
641-
long_index = MultiIndex.from_arrays([index, columns])
642-
lp = DataFrame(sel.values['values'], index=long_index,
643-
columns=fields)
643+
values = sel.values['values']
644644

645-
if not long_index.has_duplicates:
646-
lp = lp.sortlevel(level=0)
647-
wp = lp.to_panel()
645+
major = Factor(index)
646+
minor = Factor(columns)
647+
648+
J, K = len(major.levels), len(minor.levels)
649+
key = major.labels * K + minor.labels
650+
651+
if len(unique_int64(key)) == len(key):
652+
sorter, _ = lib.groupsort_indexer(key, J * K)
653+
654+
# the data need to be sorted
655+
sorted_values = values.take(sorter, axis=0)
656+
major_labels = major.labels.take(sorter)
657+
minor_labels = minor.labels.take(sorter)
658+
659+
block = block2d_to_block3d(sorted_values, fields, (J, K),
660+
major_labels, minor_labels)
661+
662+
mgr = BlockManager([block], [block.items,
663+
major.levels, minor.levels])
664+
wp = Panel(mgr)
648665
else:
649666
if not self._quiet: # pragma: no cover
650667
print ('Duplicate entries in table, taking most recently '
651668
'appended')
652669

670+
# reconstruct
671+
long_index = MultiIndex.from_arrays([index, columns])
672+
lp = DataFrame(values, index=long_index, columns=fields)
673+
653674
# need a better algorithm
654675
tuple_index = long_index.get_tuple_index()
655676
index_map = lib.map_indices_object(tuple_index)

pandas/src/hashtable.pyx

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -567,6 +567,8 @@ cdef class Int64HashTable:
567567
khiter_t k
568568
list uniques = []
569569

570+
# TODO: kvec
571+
570572
for i in range(n):
571573
val = values[i]
572574
k = kh_get_int64(self.table, val)

0 commit comments

Comments
 (0)