Skip to content

Commit 1134c9f

Browse files
committed
ENH: DataFrame.{iterrows, to_panel}, MultiIndex.{copy, has_duplicates}. More
testing for the LongPanel removal. Still lots of failing tests
1 parent a609574 commit 1134c9f

File tree

12 files changed

+224
-192
lines changed

12 files changed

+224
-192
lines changed

TODO.rst

+3-3
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
LongPanel removal
22
=================
33

4-
- level to flex methods
5-
- level to reindex
6-
- fast take for items
4+
- DONE level to flex methods
5+
- DONE level to reindex
6+
- ?? fast take for items
77

88

99
DONE

pandas/core/frame.py

+76
Original file line numberDiff line numberDiff line change
@@ -375,6 +375,18 @@ def iteritems(self):
375375
"""Iterator over (column, series) pairs"""
376376
return ((k, self[k]) for k in self.columns)
377377

378+
def iterrows(self):
    """
    Yield each row of the DataFrame as an (index label, Series) pair.

    Each yielded Series is a view of the corresponding row of
    ``self.values``, indexed by the frame's columns and named after the
    row's index label.
    """
    from itertools import izip

    col_index = self.columns
    for label, row in izip(self.index, self.values):
        row_series = row.view(Series)
        row_series.index = col_index
        row_series.name = label
        yield label, row_series
389+
378390
iterkv = iteritems
379391
if py3compat.PY3: # pragma: no cover
380392
items = iteritems
@@ -687,6 +699,70 @@ def to_sparse(self, fill_value=None, kind='block'):
687699
default_kind=kind,
688700
default_fill_value=fill_value)
689701

702+
def to_panel(self):
    """
    Transform long (stacked) format (DataFrame) into wide (3D, Panel)
    format.

    Currently the index of the DataFrame must be a 2-level MultiIndex. This
    may be generalized later

    Notes
    -----
    Observations are written into the wide array in ascending
    (major, minor) label order, so the rows are expected to already be
    ordered that way. (major, minor) combinations that do not occur in the
    index are filled with NaN for non-integer dtypes.

    Returns
    -------
    panel : Panel

    Raises
    ------
    AssertionError
        If the index is not a 2-level MultiIndex, or if some
        (major, minor) pair occurs more than once
    """
    from pandas.core.panel import Panel

    # only support this kind for now. Validate *before* touching
    # index.levels, and raise explicitly so the check survives python -O
    if not (isinstance(self.index, MultiIndex) and
            len(self.index.levels) == 2):
        raise AssertionError('to_panel requires a DataFrame indexed by a '
                             '2-level MultiIndex')

    major_axis, minor_axis = self.index.levels
    N, K = len(major_axis), len(minor_axis)
    wide_shape = (len(self.columns), N, K)

    def make_mask(index):
        """
        Create observation selection vector using major and minor
        labels, for converting to wide format.
        """
        # position of each observation in the flattened N x K grid
        selector = index.labels[1] + K * index.labels[0]
        mask = np.zeros(N * K, dtype=bool)
        mask.put(selector, True)
        return mask

    def _empty_wide(shape, dtype):
        """
        Allocate the output array, NaN-filled unless the dtype is integer
        """
        out = np.empty(shape, dtype=dtype)
        # NaN is not representable for integers; those cells are left as
        # allocated, exactly as before
        if not issubclass(out.dtype.type, np.integer):
            out.fill(np.nan)
        return out

    def _to_wide_homogeneous():
        frame_values = self.values
        values = _empty_wide(wide_shape, frame_values.dtype)
        # one 2D slab per column of the long frame
        for i, column in enumerate(frame_values.T):
            values[i].flat[mask] = column
        return Panel(values, self.columns, major_axis, minor_axis)

    def _to_wide_mixed():
        # TODO: make much more efficient
        data = {}
        for item in self.columns:
            item_vals = self[item].values
            # pre-fill so unobserved cells are NaN rather than whatever
            # happened to be in the freshly allocated buffer
            values = _empty_wide((N, K), item_vals.dtype)
            values.flat[mask] = item_vals
            data[item] = DataFrame(values, index=major_axis,
                                   columns=minor_axis)
        return Panel(data, self.columns, major_axis, minor_axis)

    mask = make_mask(self.index)

    # Two rows sharing a (major, minor) pair collapse onto one mask slot;
    # the flat assignments below would then silently misalign the data
    if mask.sum() != len(self.index):
        raise AssertionError('to_panel requires unique (major, minor) '
                             'index pairs')

    if self._is_mixed_type:
        return _to_wide_mixed()
    else:
        return _to_wide_homogeneous()
763+
764+
to_wide = deprecate('to_wide', to_panel)
765+
690766
def to_csv(self, path, sep=",", na_rep='', cols=None, header=True,
691767
index=True, index_label=None, mode='w', nanRep=None):
692768
"""

pandas/core/generic.py

+23-2
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import numpy as np
44

55
from pandas.core.common import save, load
6-
from pandas.core.index import _ensure_index
6+
from pandas.core.index import MultiIndex
77
import pandas.core.datetools as datetools
88

99
#-------------------------------------------------------------------------------
@@ -118,6 +118,8 @@ def groupby(self, by=None, axis=0, level=None, as_index=True):
118118
from pandas.core.groupby import groupby
119119
return groupby(self, by, axis=axis, level=level, as_index=as_index)
120120

121+
index = None
122+
121123
def truncate(self, before=None, after=None):
122124
"""Function truncate a sorted DataFrame / Series before and/or after
123125
some particular dates.
@@ -135,8 +137,27 @@ def truncate(self, before=None, after=None):
135137
"""
136138
before = datetools.to_datetime(before)
137139
after = datetools.to_datetime(after)
140+
141+
if before is not None and after is not None:
142+
assert(before <= after)
143+
138144
# returns view, want to copy
139-
return self.ix[before:after].copy()
145+
truncated = self.ix[before:after].copy()
146+
147+
# slice off chunks of level, adjust labels, a bit of an ugly hack to
148+
# get the unit tests to pass
149+
index = truncated.index
150+
if isinstance(index, MultiIndex):
151+
if index is self.index:
152+
index = self.index.copy()
153+
level = index.levels[0]
154+
start, end = level.slice_locs(before, after)
155+
index.levels[0] = level[start:end]
156+
index.labels[0] = index.labels[0] - start
157+
158+
truncated.index = index
159+
160+
return truncated
140161

141162
def select(self, crit, axis=0):
142163
"""

pandas/core/index.py

+34-1
Original file line numberDiff line numberDiff line change
@@ -981,6 +981,22 @@ def __new__(cls, levels=None, labels=None, sortorder=None, names=None):
981981

982982
return subarr
983983

984+
def copy(self, order='C'):
    """
    Override of ndarray.copy that carries the MultiIndex attributes over

    The result is a view on the same underlying data; only the ``levels``,
    ``labels`` and ``names`` lists are new list objects (their elements are
    shared with the original). ``order`` is accepted but not used.

    Returns
    -------
    cp : Index
        Returns view on same base ndarray
    """
    duplicate = self.view(np.ndarray).view(type(self))
    # new list containers, so replacing an entry on the copy does not
    # touch the original's lists
    for attr in ('levels', 'labels', 'names'):
        setattr(duplicate, attr, list(getattr(self, attr)))
    duplicate.sortorder = self.sortorder
    return duplicate
999+
9841000
@property
9851001
def dtype(self):
9861002
return np.dtype('O')
@@ -1017,6 +1033,23 @@ def _has_complex_internals(self):
10171033
# to disable groupby tricks
10181034
return True
10191035

1036+
@property
def has_duplicates(self):
    """
    Return True if any combination of level labels occurs more than once
    """
    shape = [len(lev) for lev in self.levels]
    group_index = np.zeros(len(self), dtype='i8')

    # Encode every row as a single integer key (mixed-radix over the level
    # sizes). Walking from the innermost level outwards lets the stride be
    # a running product instead of being recomputed for every level.
    stride = 1
    for labels, size in zip(reversed(self.labels), reversed(shape)):
        # widen to int64 *before* multiplying so that narrow label arrays
        # cannot overflow
        group_index += np.asarray(labels, dtype='i8') * stride
        stride *= size

    return len(np.unique(group_index)) < len(group_index)
1052+
10201053
def get_level_values(self, level):
10211054
"""
10221055
Return vector of label values for requested level, equal to the length
@@ -1179,7 +1212,7 @@ def __getitem__(self, key):
11791212

11801213
# an optimization
11811214
result = new_tuples.view(MultiIndex)
1182-
result.levels = self.levels
1215+
result.levels = list(self.levels)
11831216
result.labels = new_labels
11841217
result.sortorder = sortorder
11851218
result.names = self.names

pandas/core/panel.py

+20-82
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ def f(self, other):
9090
'done with scalar values')
9191

9292
return self._combine(other, func)
93-
93+
f.__name__ = name
9494
return f
9595

9696
def _panel_arith_method(op, name):
@@ -1160,78 +1160,21 @@ def _get_join_index(self, other, how):
11601160
WidePanel = Panel
11611161
LongPanel = DataFrame
11621162

1163-
def panel_is_consistent(panel):
1164-
offset = max(len(panel.major_axis), len(panel.minor_axis))
1165-
major_labels = panel.major_labels.astype('i8')
1166-
minor_labels = panel.minor_labels.astype('i8')
1167-
keys = major_labels * offset + minor_labels
1168-
unique_keys = np.unique(keys)
1169-
1170-
if len(unique_keys) < len(keys):
1171-
return False
1172-
1173-
return True
1174-
1175-
def long_to_wide(lp):
1176-
"""
1177-
Transform long (stacked) format into wide format
1178-
1179-
Returns
1180-
-------
1181-
Panel
1182-
"""
1183-
assert(lp.consistent)
1184-
mask = make_mask(lp.index)
1185-
if lp._data.is_mixed_dtype():
1186-
return _to_wide_mixed(lp, mask)
1187-
else:
1188-
return _to_wide_homogeneous(lp, mask)
1189-
1190-
def _to_wide_homogeneous(lp, mask):
1191-
shape = _wide_shape(lp)
1192-
values = np.empty(shape, dtype=lp.values.dtype)
1193-
1194-
if not issubclass(lp.values.dtype.type, np.integer):
1195-
values.fill(np.nan)
1196-
1197-
for i in xrange(len(lp.items)):
1198-
values[i].flat[mask] = lp.values[:, i]
1199-
1200-
return Panel(values, lp.items, lp.major_axis, lp.minor_axis)
1201-
1202-
def _to_wide_mixed(lp, mask):
1203-
_, N, K = _wide_shape(lp)
1204-
1205-
# TODO: make much more efficient
1206-
1207-
data = {}
1208-
for i, item in enumerate(lp.items):
1209-
item_vals = lp[item].values
1210-
1211-
values = np.empty((N, K), dtype=item_vals.dtype)
1212-
values.ravel()[mask] = item_vals
1213-
data[item] = DataFrame(values, index=lp.major_axis,
1214-
columns=lp.minor_axis)
1215-
return Panel.from_dict(data)
1216-
1217-
def _wide_shape(lp):
1218-
return (len(lp.columns), len(lp.index.levels[0]), len(lp.index.levels[1]))
1219-
12201163
def long_swapaxes(frame):
12211164
"""
12221165
Swap major and minor axes and reorder values to be grouped by
12231166
minor axis values
12241167
12251168
Returns
12261169
-------
1227-
LongPanel (new object)
1170+
DataFrame (new object)
12281171
"""
12291172
return frame.swaplevel(0, 1, axis=0)
12301173

12311174

12321175
def long_truncate(lp, before=None, after=None):
12331176
"""
1234-
Slice panel between two major axis values, return complete LongPanel
1177+
Slice panel between two major axis values, return complete DataFrame
12351178
12361179
Parameters
12371180
----------
@@ -1242,7 +1185,7 @@ def long_truncate(lp, before=None, after=None):
12421185
12431186
Returns
12441187
-------
1245-
LongPanel
1188+
DataFrame
12461189
"""
12471190
left, right = lp.index.slice_locs(before, after)
12481191
new_index = lp.index.truncate(before, after)
@@ -1260,13 +1203,11 @@ def long_apply(lp, f, axis='major', broadcast=False):
12601203
f : function
12611204
NumPy function to apply to each group
12621205
axis : {'major', 'minor'}
1263-
12641206
broadcast : boolean
12651207
12661208
Returns
12671209
-------
1268-
broadcast=True -> LongPanel
1269-
broadcast=False -> DataFrame
1210+
applied : DataFrame
12701211
"""
12711212
try:
12721213
return lp._apply_level(f, axis=axis, broadcast=broadcast)
@@ -1278,8 +1219,8 @@ def long_apply(lp, f, axis='major', broadcast=False):
12781219

12791220
def make_dummies(frame, item):
12801221
"""
1281-
Use unique values in column of panel to construct LongPanel
1282-
containing dummy
1222+
Use unique values in column of panel to construct DataFrame containing
1223+
dummy variables in the columns (constructed from the unique values)
12831224
12841225
Parameters
12851226
----------
@@ -1288,15 +1229,15 @@ def make_dummies(frame, item):
12881229
12891230
Returns
12901231
-------
1291-
LongPanel
1232+
dummies : DataFrame
12921233
"""
12931234
from pandas import Factor
12941235
factor = Factor(frame[item].values)
12951236
values = np.eye(len(factor.levels))
12961237
dummy_mat = values.take(factor.labels, axis=0)
12971238
return DataFrame(dummy_mat, columns=factor.levels, index=frame.index)
12981239

1299-
def make_axis_dummies(frame, axis='minor'):
1240+
def make_axis_dummies(frame, axis='minor', transform=None):
13001241
"""
13011242
Construct 1-0 dummy variables corresponding to designated axis
13021243
labels
@@ -1308,19 +1249,27 @@ def make_axis_dummies(frame, axis='minor'):
13081249
Function to apply to axis labels first. For example, to
13091250
get "day of week" dummies in a time series regression you might
13101251
call:
1311-
panel.get_axis_dummies(axis='major',
1312-
transform=lambda d: d.weekday())
1252+
make_axis_dummies(panel, axis='major',
1253+
transform=lambda d: d.weekday())
13131254
Returns
13141255
-------
1315-
LongPanel, item names taken from chosen axis
1256+
dummies : DataFrame
1257+
Column names taken from chosen axis
13161258
"""
13171259
numbers = {
13181260
'major' : 0,
13191261
'minor' : 1
13201262
}
13211263
num = numbers.get(axis, axis)
1264+
13221265
items = frame.index.levels[num]
13231266
labels = frame.index.labels[num]
1267+
if transform is not None:
1268+
mapped_items = items.map(transform)
1269+
factor = Factor(mapped_items.take(labels))
1270+
labels = factor.labels
1271+
items = factor.levels
1272+
13241273
values = np.eye(len(items), dtype=float)
13251274
values = values.take(labels, axis=0)
13261275

@@ -1407,16 +1356,5 @@ def _get_distinct_indexes(indexes):
14071356
indexes = sorted(indexes, key=id)
14081357
return [gp.next() for _, gp in groupby(indexes, id)]
14091358

1410-
def make_mask(index):
1411-
"""
1412-
Create observation selection vector using major and minor
1413-
labels, for converting to wide format.
1414-
"""
1415-
N, K = index.levshape
1416-
selector = index.labels[1] + K * index.labels[0]
1417-
mask = np.zeros(N * K, dtype=bool)
1418-
mask.put(selector, True)
1419-
return mask
1420-
14211359
def _monotonic(arr):
14221360
return not (arr[1:] < arr[:-1]).any()

0 commit comments

Comments
 (0)