Skip to content

Commit 2e95a0f

Browse files
committed
BUG: hacks to support duplicate DataFrame columns in BlockManager, irow/icol with duplicates, ix[...] with dups too. close #1374
1 parent 9d28918 commit 2e95a0f

File tree

7 files changed

+117
-44
lines changed

7 files changed

+117
-44
lines changed

pandas/core/format.py

+11-9
Original file line number | Diff line number | Diff line change
@@ -171,6 +171,7 @@ def __init__(self, frame, buf=None, columns=None, col_space=None,
171171

172172
if columns is not None:
173173
self.columns = _ensure_index(columns)
174+
self.frame = self.frame[self.columns]
174175
else:
175176
self.columns = frame.columns
176177

@@ -196,7 +197,7 @@ def to_string(self, force_unicode=False):
196197

197198
for i, c in enumerate(self.columns):
198199
if self.header:
199-
fmt_values = self._format_col(c)
200+
fmt_values = self._format_col(i)
200201
cheader = str_columns[i]
201202
max_len = max(max(len(x) for x in fmt_values),
202203
max(len(x) for x in cheader))
@@ -208,9 +209,9 @@ def to_string(self, force_unicode=False):
208209
stringified.append(_make_fixed_width(fmt_values,
209210
self.justify))
210211
else:
211-
stringified = [_make_fixed_width(self._format_col(c),
212+
stringified = [_make_fixed_width(self._format_col(i),
212213
self.justify)
213-
for c in self.columns]
214+
for i, c in enumerate(self.columns)]
214215

215216
if self.index:
216217
to_write.append(adjoin(1, str_index, *stringified))
@@ -232,9 +233,10 @@ def to_string(self, force_unicode=False):
232233

233234
self.buf.writelines(to_write)
234235

235-
def _format_col(self, col):
236+
def _format_col(self, i):
237+
col = self.columns[i]
236238
formatter = self.formatters.get(col)
237-
return format_array(self.frame[col].values, formatter,
239+
return format_array(self.frame.icol(i).values, formatter,
238240
float_format=self.float_format,
239241
na_rep=self.na_rep,
240242
space=self.col_space)
@@ -329,8 +331,8 @@ def _maybe_bold_row(x):
329331
return x
330332

331333
fmt_values = {}
332-
for col in self.columns:
333-
fmt_values[col] = self._format_col(col)
334+
for i in range(len(self.columns)):
335+
fmt_values[i] = self._format_col(i)
334336

335337
# write values
336338
for i in range(len(frame)):
@@ -339,8 +341,8 @@ def _maybe_bold_row(x):
339341
row.extend(_maybe_bold_row(frame.index[i]))
340342
else:
341343
row.append(_maybe_bold_row(frame.index[i]))
342-
for col in self.columns:
343-
row.append(fmt_values[col][i])
344+
for j in range(len(self.columns)):
345+
row.append(fmt_values[j][i])
344346
write_tr(row, indent, indent_delta)
345347
indent -= indent_delta
346348
write('</tbody>', indent)

pandas/core/frame.py

+25-4
Original file line number | Diff line number | Diff line change
@@ -1562,7 +1562,7 @@ def set_value(self, index, col, value):
15621562

15631563
return result.set_value(index, col, value)
15641564

1565-
def irow(self, i):
1565+
def irow(self, i, copy=False):
15661566
"""
15671567
Retrieve the i-th row or rows of the DataFrame by location
15681568
@@ -1585,7 +1585,12 @@ def irow(self, i):
15851585
if isinstance(label, Index):
15861586
return self.reindex(label)
15871587
else:
1588-
return self.xs(label)
1588+
try:
1589+
new_values = self._data.fast_2d_xs(i, copy=copy)
1590+
except:
1591+
new_values = self._data.fast_2d_xs(i, copy=True)
1592+
return Series(new_values, index=self.columns,
1593+
name=self.index[i])
15891594

15901595
def icol(self, i):
15911596
"""
@@ -1609,7 +1614,18 @@ def icol(self, i):
16091614
lab_slice = slice(label[0], label[-1])
16101615
return self.ix[:, lab_slice]
16111616
else:
1612-
return self[label]
1617+
label = self.columns[i]
1618+
if isinstance(label, Index):
1619+
return self.reindex(columns=label)
1620+
1621+
values = self._data.iget(i)
1622+
return Series(values, index=self.index, name=label)
1623+
1624+
def _ixs(self, i, axis=0):
1625+
if axis == 0:
1626+
return self.irow(i)
1627+
else:
1628+
return self.icol(i)
16131629

16141630
def iget_value(self, i, j):
16151631
"""
@@ -1714,7 +1730,12 @@ def _getitem_multilevel(self, key):
17141730
return self._get_item_cache(key)
17151731

17161732
def _box_item_values(self, key, values):
1717-
return Series(values, index=self.index, name=key)
1733+
if values.ndim == 2:
1734+
item_cols = self.columns[self.columns.get_loc(key)]
1735+
return DataFrame(values.T, columns=item_cols,
1736+
index=self.index)
1737+
else:
1738+
return Series(values, index=self.index, name=key)
17181739

17191740
def __getattr__(self, name):
17201741
"""After regular attribute access, try looking up the name of a column.

pandas/core/indexing.py

+5-2
Original file line number | Diff line number | Diff line change
@@ -40,6 +40,9 @@ def _get_label(self, label, axis=0):
4040
except Exception:
4141
return self.obj.xs(label, axis=axis, copy=True)
4242

43+
def _get_loc(self, key, axis=0):
44+
return self.obj._ixs(key, axis=axis)
45+
4346
def _slice(self, obj, axis=0):
4447
return self.obj._slice(obj, axis=axis)
4548

@@ -228,14 +231,14 @@ def _getitem_axis(self, key, axis=0):
228231
raise
229232

230233
if not is_int_index:
231-
idx = labels[key]
234+
return self._get_loc(key, axis=0)
232235

233236
return self._get_label(idx, axis=0)
234237
else:
235238
labels = self.obj._get_axis(axis)
236239
lab = key
237240
if com.is_integer(key) and not _is_integer_index(labels):
238-
lab = labels[key]
241+
return self._get_loc(key, axis=axis)
239242
return self._get_label(lab, axis=axis)
240243

241244
def _getitem_iterable(self, key, axis=0):

pandas/core/internals.py

+20-9
Original file line number | Diff line number | Diff line change
@@ -554,7 +554,7 @@ def shape(self):
554554
return tuple(len(ax) for ax in self.axes)
555555

556556
def _verify_integrity(self):
557-
_union_block_items(self.blocks)
557+
# _union_block_items(self.blocks)
558558
mgr_shape = self.shape
559559
for block in self.blocks:
560560
assert(block.ref_items is self.items)
@@ -631,14 +631,6 @@ def get_series_dict(self):
631631
# For DataFrame
632632
return _blocks_to_series_dict(self.blocks, self.axes[1])
633633

634-
@classmethod
635-
def from_blocks(cls, blocks, index):
636-
# also checks for overlap
637-
items = _union_block_items(blocks)
638-
for blk in blocks:
639-
blk.ref_items = items
640-
return BlockManager(blocks, [items, index])
641-
642634
def __contains__(self, item):
643635
return item in self.items
644636

@@ -783,6 +775,25 @@ def get(self, item):
783775
_, block = self._find_block(item)
784776
return block.get(item)
785777

778+
def iget(self, i):
779+
item = self.items[i]
780+
if self.items.is_unique:
781+
return self.get(item)
782+
else:
783+
# ugh
784+
inds, = (self.items == item).nonzero()
785+
786+
_, block = self._find_block(item)
787+
788+
binds, = (block.items == item).nonzero()
789+
790+
for j, (k, b) in enumerate(zip(inds, binds)):
791+
if i == k:
792+
return block.values[b]
793+
794+
raise Exception('Cannot have duplicate column names '
795+
'split across dtypes')
796+
786797
def get_scalar(self, tup):
787798
"""
788799
Retrieve single item

pandas/tests/test_frame.py

+16-9
Original file line number | Diff line number | Diff line change
@@ -1095,6 +1095,22 @@ def test_icol(self):
10951095
expected = df.reindex(columns=df.columns[[1, 2, 4, 6]])
10961096
assert_frame_equal(result, expected)
10971097

1098+
def test_irow_icol_duplicates(self):
1099+
df = DataFrame(np.random.rand(3,3), columns=list('ABC'),
1100+
index=list('aab'))
1101+
1102+
result = df.irow(0)
1103+
result2 = df.ix[0]
1104+
self.assert_(isinstance(result, Series))
1105+
assert_almost_equal(result.values, df.values[0])
1106+
assert_series_equal(result, result2)
1107+
1108+
result = df.T.icol(0)
1109+
result2 = df.T.ix[:, 0]
1110+
self.assert_(isinstance(result, Series))
1111+
assert_almost_equal(result.values, df.values[0])
1112+
assert_series_equal(result, result2)
1113+
10981114
def test_iget_value(self):
10991115
for i, row in enumerate(self.frame.index):
11001116
for j, col in enumerate(self.frame.columns):
@@ -4490,12 +4506,6 @@ def test_rename(self):
44904506
'C' : 'c',
44914507
'D' : 'd'
44924508
}
4493-
bad_mapping = {
4494-
'A' : 'a',
4495-
'B' : 'b',
4496-
'C' : 'b',
4497-
'D' : 'd'
4498-
}
44994509

45004510
renamed = self.frame.rename(columns=mapping)
45014511
renamed2 = self.frame.rename(columns=str.lower)
@@ -4504,9 +4514,6 @@ def test_rename(self):
45044514
assert_frame_equal(renamed2.rename(columns=str.upper),
45054515
self.frame)
45064516

4507-
self.assertRaises(Exception, self.frame.rename,
4508-
columns=bad_mapping)
4509-
45104517
# index
45114518

45124519
data = {

pandas/tests/test_internals.py

+38-9
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,7 @@ def get_float_mat(n, k):
2323
N = 10
2424

2525
def get_float_ex(cols=['a', 'c', 'e']):
26-
floats = get_float_mat(N, 3).T
26+
floats = get_float_mat(N, len(cols)).T
2727
return make_block(floats, cols, TEST_COLS)
2828

2929
def get_complex_ex(cols=['h']):
@@ -192,7 +192,15 @@ def setUp(self):
192192
get_bool_ex(),
193193
get_int_ex(),
194194
get_complex_ex()]
195-
self.mgr = BlockManager.from_blocks(self.blocks, np.arange(N))
195+
196+
all_items = [b.items for b in self.blocks]
197+
198+
items = sorted(all_items[0].append(all_items[1:]))
199+
items = Index(items)
200+
for b in self.blocks:
201+
b.ref_items = items
202+
203+
self.mgr = BlockManager(self.blocks, [items, np.arange(N)])
196204

197205
def test_constructor_corner(self):
198206
pass
@@ -204,8 +212,12 @@ def test_attrs(self):
204212
def test_is_mixed_dtype(self):
205213
self.assert_(self.mgr.is_mixed_dtype())
206214

215+
items = Index(['a', 'b'])
207216
blocks = [get_bool_ex(['a']), get_bool_ex(['b'])]
208-
mgr = BlockManager.from_blocks(blocks, np.arange(N))
217+
for b in blocks:
218+
b.ref_items = items
219+
220+
mgr = BlockManager(blocks, [items, np.arange(N)])
209221
self.assert_(not mgr.is_mixed_dtype())
210222

211223
def test_is_indexed_like(self):
@@ -233,6 +245,15 @@ def test_union_block_items(self):
233245
self.assert_(np.array_equal(internals._union_block_items(blocks),
234246
['a', 'b', 'c', 'd', 'e', 'f']))
235247

248+
def test_duplicate_item_failure(self):
249+
items = Index(['a', 'a'])
250+
blocks = [get_bool_ex(['a']), get_float_ex(['a'])]
251+
for b in blocks:
252+
b.ref_items = items
253+
254+
mgr = BlockManager(blocks, [items, np.arange(N)])
255+
self.assertRaises(Exception, mgr.iget, 1)
256+
236257
def test_contains(self):
237258
self.assert_('a' in self.mgr)
238259
self.assert_('baz' not in self.mgr)
@@ -288,27 +309,35 @@ def test_as_matrix(self):
288309
pass
289310

290311
def test_as_matrix_int_bool(self):
312+
items = Index(['a', 'b'])
313+
291314
blocks = [get_bool_ex(['a']), get_bool_ex(['b'])]
315+
for b in blocks:
316+
b.ref_items = items
292317
index_sz = blocks[0].values.shape[1]
293-
mgr = BlockManager.from_blocks(blocks, np.arange(index_sz))
318+
mgr = BlockManager(blocks, [items, np.arange(index_sz)])
294319
self.assert_(mgr.as_matrix().dtype == np.bool_)
295320

296321
blocks = [get_int_ex(['a']), get_int_ex(['b'])]
297-
mgr = BlockManager.from_blocks(blocks, np.arange(index_sz))
322+
for b in blocks:
323+
b.ref_items = items
324+
325+
mgr = BlockManager(blocks, [items, np.arange(index_sz)])
298326
self.assert_(mgr.as_matrix().dtype == np.int64)
299327

300328
def test_as_matrix_datetime(self):
329+
items = Index(['h', 'g'])
301330
blocks = [get_dt_ex(['h']), get_dt_ex(['g'])]
331+
for b in blocks:
332+
b.ref_items = items
333+
302334
index_sz = blocks[0].values.shape[1]
303-
mgr = BlockManager.from_blocks(blocks, np.arange(index_sz))
335+
mgr = BlockManager(blocks, [items, np.arange(index_sz)])
304336
self.assert_(mgr.as_matrix().dtype == 'M8[ns]')
305337

306338
def test_xs(self):
307339
pass
308340

309-
def test_from_blocks(self):
310-
self.assert_(np.array_equal(self.mgr.items, TEST_COLS))
311-
312341
def test_interleave(self):
313342
pass
314343

pandas/tests/test_multilevel.py

+2-2
Original file line number | Diff line number | Diff line change
@@ -505,8 +505,8 @@ def test_getitem_partial_column_select(self):
505505
expected = df.ix[('a', 'y')][[1, 0]]
506506
assert_frame_equal(result, expected)
507507

508-
key = (('a', 'foo'), slice(None, None, None))
509-
self.assertRaises(KeyError, df.ix.__getitem__, key)
508+
self.assertRaises(KeyError, df.ix.__getitem__,
509+
(('a', 'foo'), slice(None, None)))
510510

511511
def test_sortlevel(self):
512512
df = self.frame.copy()

0 commit comments

Comments
 (0)