BUG: hacks to support duplicate DataFrame columns in BlockManager; irow/icol and ix[...] now handle duplicates too. Closes #1374

wesm · wesm · commit 2e95a0fe5638 · 2012-06-03T13:06:58.000-04:00
diff --git a/pandas/core/format.py b/pandas/core/format.py
@@ -171,6 +171,7 @@ def __init__(self, frame, buf=None, columns=None, col_space=None,
 
         if columns is not None:
             self.columns = _ensure_index(columns)
+            self.frame = self.frame[self.columns]
         else:
             self.columns = frame.columns
 
@@ -196,7 +197,7 @@ def to_string(self, force_unicode=False):
 
             for i, c in enumerate(self.columns):
                 if self.header:
-                    fmt_values = self._format_col(c)
+                    fmt_values = self._format_col(i)
                     cheader = str_columns[i]
                     max_len = max(max(len(x) for x in fmt_values),
                                   max(len(x) for x in cheader))
@@ -208,9 +209,9 @@ def to_string(self, force_unicode=False):
                     stringified.append(_make_fixed_width(fmt_values,
                                                          self.justify))
                 else:
-                    stringified = [_make_fixed_width(self._format_col(c),
+                    stringified = [_make_fixed_width(self._format_col(i),
                                                      self.justify)
-                                   for c in self.columns]
+                                   for i, c in enumerate(self.columns)]
 
             if self.index:
                 to_write.append(adjoin(1, str_index, *stringified))
@@ -232,9 +233,10 @@ def to_string(self, force_unicode=False):
 
         self.buf.writelines(to_write)
 
-    def _format_col(self, col):
+    def _format_col(self, i):
+        col = self.columns[i]
         formatter = self.formatters.get(col)
-        return format_array(self.frame[col].values, formatter,
+        return format_array(self.frame.icol(i).values, formatter,
                             float_format=self.float_format,
                             na_rep=self.na_rep,
                             space=self.col_space)
@@ -329,8 +331,8 @@ def _maybe_bold_row(x):
                     return x
 
             fmt_values = {}
-            for col in self.columns:
-                fmt_values[col] = self._format_col(col)
+            for i in range(len(self.columns)):
+                fmt_values[i] = self._format_col(i)
 
             # write values
             for i in range(len(frame)):
@@ -339,8 +341,8 @@ def _maybe_bold_row(x):
                     row.extend(_maybe_bold_row(frame.index[i]))
                 else:
                     row.append(_maybe_bold_row(frame.index[i]))
-                for col in self.columns:
-                    row.append(fmt_values[col][i])
+                for j in range(len(self.columns)):
+                    row.append(fmt_values[j][i])
                 write_tr(row, indent, indent_delta)
             indent -= indent_delta
             write('</tbody>', indent)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -1562,7 +1562,7 @@ def set_value(self, index, col, value):
 
             return result.set_value(index, col, value)
 
-    def irow(self, i):
+    def irow(self, i, copy=False):
         """
         Retrieve the i-th row or rows of the DataFrame by location
 
@@ -1585,7 +1585,12 @@ def irow(self, i):
             if isinstance(label, Index):
                 return self.reindex(label)
             else:
-                return self.xs(label)
+                try:
+                    new_values = self._data.fast_2d_xs(i, copy=copy)
+                except:
+                    new_values = self._data.fast_2d_xs(i, copy=True)
+                return Series(new_values, index=self.columns,
+                              name=self.index[i])
 
     def icol(self, i):
         """
@@ -1609,7 +1614,18 @@ def icol(self, i):
             lab_slice = slice(label[0], label[-1])
             return self.ix[:, lab_slice]
         else:
-            return self[label]
+            label = self.columns[i]
+            if isinstance(label, Index):
+                return self.reindex(columns=label)
+
+            values = self._data.iget(i)
+            return Series(values, index=self.index, name=label)
+
+    def _ixs(self, i, axis=0):
+        if axis == 0:
+            return self.irow(i)
+        else:
+            return self.icol(i)
 
     def iget_value(self, i, j):
         """
@@ -1714,7 +1730,12 @@ def _getitem_multilevel(self, key):
             return self._get_item_cache(key)
 
     def _box_item_values(self, key, values):
-        return Series(values, index=self.index, name=key)
+        if values.ndim == 2:
+            item_cols = self.columns[self.columns.get_loc(key)]
+            return DataFrame(values.T, columns=item_cols,
+                             index=self.index)
+        else:
+            return Series(values, index=self.index, name=key)
 
     def __getattr__(self, name):
         """After regular attribute access, try looking up the name of a column.
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
@@ -40,6 +40,9 @@ def _get_label(self, label, axis=0):
         except Exception:
             return self.obj.xs(label, axis=axis, copy=True)
 
+    def _get_loc(self, key, axis=0):
+        return self.obj._ixs(key, axis=axis)
+
     def _slice(self, obj, axis=0):
         return self.obj._slice(obj, axis=axis)
 
@@ -228,14 +231,14 @@ def _getitem_axis(self, key, axis=0):
                             raise
 
                 if not is_int_index:
-                    idx = labels[key]
+                    return self._get_loc(key, axis=0)
 
             return self._get_label(idx, axis=0)
         else:
             labels = self.obj._get_axis(axis)
             lab = key
             if com.is_integer(key) and not _is_integer_index(labels):
-                lab = labels[key]
+                return self._get_loc(key, axis=axis)
             return self._get_label(lab, axis=axis)
 
     def _getitem_iterable(self, key, axis=0):
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
@@ -554,7 +554,7 @@ def shape(self):
         return tuple(len(ax) for ax in self.axes)
 
     def _verify_integrity(self):
-        _union_block_items(self.blocks)
+        # _union_block_items(self.blocks)
         mgr_shape = self.shape
         for block in self.blocks:
             assert(block.ref_items is self.items)
@@ -631,14 +631,6 @@ def get_series_dict(self):
         # For DataFrame
         return _blocks_to_series_dict(self.blocks, self.axes[1])
 
-    @classmethod
-    def from_blocks(cls, blocks, index):
-        # also checks for overlap
-        items = _union_block_items(blocks)
-        for blk in blocks:
-            blk.ref_items = items
-        return BlockManager(blocks, [items, index])
-
     def __contains__(self, item):
         return item in self.items
 
@@ -783,6 +775,25 @@ def get(self, item):
         _, block = self._find_block(item)
         return block.get(item)
 
+    def iget(self, i):
+        item = self.items[i]
+        if self.items.is_unique:
+            return self.get(item)
+        else:
+            # ugh
+            inds, = (self.items == item).nonzero()
+
+            _, block = self._find_block(item)
+
+            binds, = (block.items == item).nonzero()
+
+            for j, (k, b) in enumerate(zip(inds, binds)):
+                if i == k:
+                    return block.values[b]
+
+            raise Exception('Cannot have duplicate column names '
+                            'split across dtypes')
+
     def get_scalar(self, tup):
         """
         Retrieve single item
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -1095,6 +1095,22 @@ def test_icol(self):
         expected = df.reindex(columns=df.columns[[1, 2, 4, 6]])
         assert_frame_equal(result, expected)
 
+    def test_irow_icol_duplicates(self):
+        df = DataFrame(np.random.rand(3,3), columns=list('ABC'),
+                       index=list('aab'))
+
+        result = df.irow(0)
+        result2 = df.ix[0]
+        self.assert_(isinstance(result, Series))
+        assert_almost_equal(result.values, df.values[0])
+        assert_series_equal(result, result2)
+
+        result = df.T.icol(0)
+        result2 = df.T.ix[:, 0]
+        self.assert_(isinstance(result, Series))
+        assert_almost_equal(result.values, df.values[0])
+        assert_series_equal(result, result2)
+
     def test_iget_value(self):
         for i, row in enumerate(self.frame.index):
             for j, col in enumerate(self.frame.columns):
@@ -4490,12 +4506,6 @@ def test_rename(self):
             'C' : 'c',
             'D' : 'd'
         }
-        bad_mapping = {
-            'A' : 'a',
-            'B' : 'b',
-            'C' : 'b',
-            'D' : 'd'
-        }
 
         renamed = self.frame.rename(columns=mapping)
         renamed2 = self.frame.rename(columns=str.lower)
@@ -4504,9 +4514,6 @@ def test_rename(self):
         assert_frame_equal(renamed2.rename(columns=str.upper),
                            self.frame)
 
-        self.assertRaises(Exception, self.frame.rename,
-                          columns=bad_mapping)
-
         # index
 
         data = {
diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py
@@ -23,7 +23,7 @@ def get_float_mat(n, k):
 N = 10
 
 def get_float_ex(cols=['a', 'c', 'e']):
-    floats = get_float_mat(N, 3).T
+    floats = get_float_mat(N, len(cols)).T
     return make_block(floats, cols, TEST_COLS)
 
 def get_complex_ex(cols=['h']):
@@ -192,7 +192,15 @@ def setUp(self):
                        get_bool_ex(),
                        get_int_ex(),
                        get_complex_ex()]
-        self.mgr = BlockManager.from_blocks(self.blocks, np.arange(N))
+
+        all_items = [b.items for b in self.blocks]
+
+        items = sorted(all_items[0].append(all_items[1:]))
+        items = Index(items)
+        for b in self.blocks:
+            b.ref_items = items
+
+        self.mgr = BlockManager(self.blocks, [items, np.arange(N)])
 
     def test_constructor_corner(self):
         pass
@@ -204,8 +212,12 @@ def test_attrs(self):
     def test_is_mixed_dtype(self):
         self.assert_(self.mgr.is_mixed_dtype())
 
+        items = Index(['a', 'b'])
         blocks = [get_bool_ex(['a']), get_bool_ex(['b'])]
-        mgr = BlockManager.from_blocks(blocks, np.arange(N))
+        for b in blocks:
+            b.ref_items = items
+
+        mgr = BlockManager(blocks, [items,  np.arange(N)])
         self.assert_(not mgr.is_mixed_dtype())
 
     def test_is_indexed_like(self):
@@ -233,6 +245,15 @@ def test_union_block_items(self):
         self.assert_(np.array_equal(internals._union_block_items(blocks),
                                     ['a', 'b', 'c', 'd', 'e', 'f']))
 
+    def test_duplicate_item_failure(self):
+        items = Index(['a', 'a'])
+        blocks = [get_bool_ex(['a']), get_float_ex(['a'])]
+        for b in blocks:
+            b.ref_items = items
+
+        mgr = BlockManager(blocks, [items, np.arange(N)])
+        self.assertRaises(Exception, mgr.iget, 1)
+
     def test_contains(self):
         self.assert_('a' in self.mgr)
         self.assert_('baz' not in self.mgr)
@@ -288,27 +309,35 @@ def test_as_matrix(self):
         pass
 
     def test_as_matrix_int_bool(self):
+        items = Index(['a', 'b'])
+
         blocks = [get_bool_ex(['a']), get_bool_ex(['b'])]
+        for b in blocks:
+            b.ref_items = items
         index_sz = blocks[0].values.shape[1]
-        mgr = BlockManager.from_blocks(blocks, np.arange(index_sz))
+        mgr = BlockManager(blocks, [items, np.arange(index_sz)])
         self.assert_(mgr.as_matrix().dtype == np.bool_)
 
         blocks = [get_int_ex(['a']), get_int_ex(['b'])]
-        mgr = BlockManager.from_blocks(blocks, np.arange(index_sz))
+        for b in blocks:
+            b.ref_items = items
+
+        mgr = BlockManager(blocks, [items, np.arange(index_sz)])
         self.assert_(mgr.as_matrix().dtype == np.int64)
 
     def test_as_matrix_datetime(self):
+        items = Index(['h', 'g'])
         blocks = [get_dt_ex(['h']), get_dt_ex(['g'])]
+        for b in blocks:
+            b.ref_items = items
+
         index_sz = blocks[0].values.shape[1]
-        mgr = BlockManager.from_blocks(blocks, np.arange(index_sz))
+        mgr = BlockManager(blocks, [items, np.arange(index_sz)])
         self.assert_(mgr.as_matrix().dtype == 'M8[ns]')
 
     def test_xs(self):
         pass
 
-    def test_from_blocks(self):
-        self.assert_(np.array_equal(self.mgr.items, TEST_COLS))
-
     def test_interleave(self):
         pass
 
diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py
@@ -505,8 +505,8 @@ def test_getitem_partial_column_select(self):
         expected = df.ix[('a', 'y')][[1, 0]]
         assert_frame_equal(result, expected)
 
-        key = (('a', 'foo'), slice(None, None, None))
-        self.assertRaises(KeyError, df.ix.__getitem__, key)
+        self.assertRaises(KeyError, df.ix.__getitem__,
+                          (('a', 'foo'), slice(None, None)))
 
     def test_sortlevel(self):
         df = self.frame.copy()