From bdf270c77ef8c84a04fcf25f805b56b00e6725c9 Mon Sep 17 00:00:00 2001 From: Jeffrey Tratner Date: Fri, 9 Aug 2013 22:26:12 -0400 Subject: [PATCH 1/4] ENH: Make core/index exceptions more descriptive * `assert_copy`: You can use `assert_copy` to check that two iterables produce copies that are not the same object. (uses `assert_almost_equal` under the hood). * Fix assert_almost_equal to handle non-ndarrays (previously failed after iterable check) * Fix test_mixed_panel to reflect true name behavior --- doc/source/indexing.rst | 4 +- pandas/core/index.py | 52 ++++++++++--------- pandas/tests/test_frame.py | 11 ++-- pandas/tests/test_index.py | 103 +++++++++++++++++++++++++++---------- pandas/tests/test_panel.py | 3 +- pandas/util/testing.py | 16 +++++- 6 files changed, 130 insertions(+), 59 deletions(-) diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index 224925f144147..29a7f84cd1452 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -1229,7 +1229,9 @@ However: :: >>> s.ix[('a', 'b'):('b', 'a')] - Exception: MultiIndex lexsort depth 1, key was length 2 + Traceback (most recent call last): + ... + KeyError: Key length (2) was greater than MultiIndex lexsort depth (1) Swapping levels with ``swaplevel`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/pandas/core/index.py b/pandas/core/index.py index 33ea4d25bc7dc..15e9398bd0180 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -109,7 +109,7 @@ def __new__(cls, data, dtype=None, copy=False, name=None, **kwargs): subarr = com._asarray_tuplesafe(data, dtype=object) elif np.isscalar(data): - raise ValueError('Index(...) must be called with a collection ' + raise TypeError('Index(...) 
must be called with a collection ' 'of some kind, %s was passed' % repr(data)) else: # other iterable of some kind @@ -201,7 +201,7 @@ def _get_names(self): def _set_names(self, values): if len(values) != 1: - raise AssertionError('Length of new names must be 1, got %d' + raise ValueError('Length of new names must be 1, got %d' % len(values)) self.name = values[0] @@ -327,7 +327,7 @@ def __hash__(self): return hash(self.view(np.ndarray)) def __setitem__(self, key, value): - raise Exception(str(self.__class__) + ' object is immutable') + raise TypeError(str(self.__class__) + ' does not support item assignment') def __getitem__(self, key): """Override numpy.ndarray's __getitem__ method to work as desired""" @@ -513,7 +513,7 @@ def order(self, return_indexer=False, ascending=True): return sorted_index def sort(self, *args, **kwargs): - raise Exception('Cannot sort an Index object') + raise TypeError('Cannot sort an %r object' % self.__class__.__name__) def shift(self, periods=1, freq=None): """ @@ -572,7 +572,7 @@ def union(self, other): union : Index """ if not hasattr(other, '__iter__'): - raise Exception('Input must be iterable!') + raise TypeError('Input must be iterable.') if len(other) == 0 or self.equals(other): return self @@ -637,7 +637,7 @@ def intersection(self, other): intersection : Index """ if not hasattr(other, '__iter__'): - raise Exception('Input must be iterable!') + raise TypeError('Input must be iterable!') self._assert_can_do_setop(other) @@ -679,7 +679,7 @@ def diff(self, other): """ if not hasattr(other, '__iter__'): - raise Exception('Input must be iterable!') + raise TypeError('Input must be iterable!') if self.equals(other): return Index([], name=self.name) @@ -807,7 +807,7 @@ def get_indexer(self, target, method=None, limit=None): return this.get_indexer(target, method=method, limit=limit) if not self.is_unique: - raise Exception('Reindexing only valid with uniquely valued Index ' + raise InvalidIndexError('Reindexing only valid with 
uniquely valued Index ' 'objects') if method == 'pad': @@ -900,7 +900,7 @@ def reindex(self, target, method=None, level=None, limit=None, target = _ensure_index(target) if level is not None: if method is not None: - raise ValueError('Fill method not supported if level passed') + raise TypeError('Fill method not supported if level passed') _, indexer, _ = self._join_level(target, level, how='right', return_indexers=True) else: @@ -1055,7 +1055,7 @@ def _join_level(self, other, level, how='left', return_indexers=False): the MultiIndex will not be changed (currently) """ if isinstance(self, MultiIndex) and isinstance(other, MultiIndex): - raise Exception('Join on level between two MultiIndex objects ' + raise TypeError('Join on level between two MultiIndex objects ' 'is ambiguous') left, right = self, other @@ -1429,11 +1429,10 @@ class MultiIndex(Index): def __new__(cls, levels=None, labels=None, sortorder=None, names=None): if len(levels) != len(labels): - raise AssertionError( + raise ValueError( 'Length of levels and labels must be the same') if len(levels) == 0: - raise Exception('Must pass non-zero number of levels/labels') - + raise TypeError('Must pass non-zero number of levels/labels') if len(levels) == 1: if names: name = names[0] @@ -1534,17 +1533,17 @@ def _get_level_number(self, level): try: count = self.names.count(level) if count > 1: - raise Exception('The name %s occurs multiple times, use a ' + raise ValueError('The name %s occurs multiple times, use a ' 'level number' % level) level = self.names.index(level) except ValueError: if not isinstance(level, int): - raise Exception('Level %s not found' % str(level)) + raise KeyError('Level %s not found' % str(level)) elif level < 0: level += self.nlevels # Note: levels are zero-based elif level >= self.nlevels: - raise ValueError('Index has only %d levels, not %d' + raise IndexError('Too many levels: Index has only %d levels, not %d' % (self.nlevels, level + 1)) return level @@ -1790,7 +1789,8 @@ def 
from_tuples(cls, tuples, sortorder=None, names=None): index : MultiIndex """ if len(tuples) == 0: - raise Exception('Cannot infer number of levels from empty list') + # I think this is right? Not quite sure... + raise TypeError('Cannot infer number of levels from empty list') if isinstance(tuples, np.ndarray): if isinstance(tuples, Index): @@ -2158,7 +2158,8 @@ def reindex(self, target, method=None, level=None, limit=None, """ if level is not None: if method is not None: - raise ValueError('Fill method not supported if level passed') + # FIXME: Should this actually be a TypeError [given that it's a signature issue] or ValueError + raise TypeError('Fill method not supported if level passed') target, indexer, _ = self._join_level(target, level, how='right', return_indexers=True) else: @@ -2202,7 +2203,7 @@ def _tuple_index(self): def slice_locs(self, start=None, end=None, strict=False): """ For an ordered MultiIndex, compute the slice locations for input - labels. They can tuples representing partial levels, e.g. for a + labels. They can be tuples representing partial levels, e.g. for a MultiIndex with 3 levels, you can pass a single value (corresponding to the first level), or a 1-, 2-, or 3-tuple. 
@@ -2240,8 +2241,9 @@ def slice_locs(self, start=None, end=None, strict=False): def _partial_tup_index(self, tup, side='left'): if len(tup) > self.lexsort_depth: - raise KeyError('MultiIndex lexsort depth %d, key was length %d' % - (self.lexsort_depth, len(tup))) + raise KeyError('Key length (%d) was greater than MultiIndex' + ' lexsort depth (%d)' % + (len(tup), self.lexsort_depth)) n = len(tup) start, end = 0, len(self) @@ -2251,7 +2253,7 @@ def _partial_tup_index(self, tup, side='left'): if lab not in lev: if not lev.is_type_compatible(lib.infer_dtype([lab])): - raise Exception('Level type mismatch: %s' % lab) + raise TypeError('Level type mismatch: %s' % lab) # short circuit loc = lev.searchsorted(lab, side=side) @@ -2546,7 +2548,7 @@ def diff(self, other): try: other = MultiIndex.from_tuples(other) except: - raise TypeError("other should be a MultiIndex or a list of tuples") + raise TypeError("other must be a MultiIndex or a list of tuples") result_names = self.names else: result_names = self.names if self.names == other.names else None @@ -2588,7 +2590,7 @@ def insert(self, loc, item): if not isinstance(item, tuple): item = (item,) + ('',) * (self.nlevels - 1) elif len(item) != self.nlevels: - raise ValueError('Passed item incompatible tuple length') + raise ValueError('Item must have length equal to number of levels.') new_levels = [] new_labels = [] @@ -2691,7 +2693,7 @@ def _ensure_index(index_like): def _validate_join_method(method): if method not in ['left', 'right', 'inner', 'outer']: - raise Exception('do not recognize join method %s' % method) + raise ValueError('do not recognize join method %s' % method) # TODO: handle index names! 
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 1b405eae08797..9e49d79de65a3 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -5920,14 +5920,15 @@ def test_corrwith_series(self): assert_series_equal(result, expected) def test_drop_names(self): - df = DataFrame([[1, 2, 3],[3, 4, 5],[5, 6, 7]], index=['a', 'b', 'c'], columns=['d', 'e', 'f']) + df = DataFrame([[1, 2, 3],[3, 4, 5],[5, 6, 7]], index=['a', 'b', 'c'], + columns=['d', 'e', 'f']) df.index.name, df.columns.name = 'first', 'second' df_dropped_b = df.drop('b') df_dropped_e = df.drop('e', axis=1) - self.assert_(df_dropped_b.index.name == 'first') - self.assert_(df_dropped_e.index.name == 'first') - self.assert_(df_dropped_b.columns.name == 'second') - self.assert_(df_dropped_e.columns.name == 'second') + self.assertEqual(df_dropped_b.index.name, 'first') + self.assertEqual(df_dropped_e.index.name, 'first') + self.assertEqual(df_dropped_b.columns.name, 'second') + self.assertEqual(df_dropped_e.columns.name, 'second') def test_dropEmptyRows(self): N = len(self.frame.index) diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index cc069a4da31e3..5942f8cd9626b 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -11,8 +11,8 @@ import numpy as np from numpy.testing import assert_array_equal -from pandas.core.index import Index, Int64Index, MultiIndex -from pandas.util.testing import assert_almost_equal +from pandas.core.index import Index, Int64Index, MultiIndex, InvalidIndexError +from pandas.util.testing import assert_almost_equal, assertRaisesRegexp from pandas import compat import pandas.util.testing as tm @@ -37,6 +37,14 @@ def setUp(self): self.empty = Index([]) self.tuples = Index(lzip(['foo', 'bar', 'baz'], [1, 2, 3])) + def test_wrong_number_names(self): + def testit(ind): + ind.names = ["apple", "banana", "carrot"] + + indices = (self.dateIndex, self.unicodeIndex, self.strIndex, self.intIndex, self.floatIndex, 
self.empty, self.tuples) + for ind in indices: + assertRaisesRegexp(ValueError, "^Length", testit, ind) + def test_hash_error(self): self.assertRaises(TypeError, hash, self.strIndex) @@ -56,10 +64,10 @@ def test_duplicates(self): self.assert_(not idx.is_unique) def test_sort(self): - self.assertRaises(Exception, self.strIndex.sort) + self.assertRaises(TypeError, self.strIndex.sort) def test_mutability(self): - self.assertRaises(Exception, self.strIndex.__setitem__, 0, 'foo') + self.assertRaises(TypeError, self.strIndex.__setitem__, 0, 'foo') def test_constructor(self): # regular instance creation @@ -85,7 +93,7 @@ def test_constructor(self): def test_constructor_corner(self): # corner case - self.assertRaises(Exception, Index, 0) + self.assertRaises(TypeError, Index, 0) def test_index_ctor_infer_periodindex(self): from pandas import period_range, PeriodIndex @@ -219,7 +227,7 @@ def test_intersection(self): self.assert_(inter is first) # non-iterable input - self.assertRaises(Exception, first.intersection, 0.5) + assertRaisesRegexp(TypeError, "iterable", first.intersection, 0.5) def test_union(self): first = self.strIndex[5:20] @@ -239,7 +247,7 @@ def test_union(self): self.assert_(union is first) # non-iterable input - self.assertRaises(Exception, first.union, 0.5) + assertRaisesRegexp(TypeError, "iterable", first.union, 0.5) # preserve names first.name = 'A' @@ -325,7 +333,7 @@ def test_diff(self): self.assertEqual(result.name, first.name) # non-iterable input - self.assertRaises(Exception, first.diff, 0.5) + assertRaisesRegexp(TypeError, "iterable", first.diff, 0.5) def test_pickle(self): def testit(index): @@ -571,6 +579,11 @@ class TestInt64Index(unittest.TestCase): def setUp(self): self.index = Int64Index(np.arange(0, 20, 2)) + def test_too_many_names(self): + def testit(): + self.index.names = ["roger", "harold"] + assertRaisesRegexp(ValueError, "^Length", testit) + def test_constructor(self): # pass list, coerce fine index = Int64Index([-5, 0, 1, 2]) @@ 
-969,7 +982,15 @@ def test_constructor_single_level(self): self.assert_(single_level.name is None) def test_constructor_no_levels(self): - self.assertRaises(Exception, MultiIndex, levels=[], labels=[]) + assertRaisesRegexp(TypeError, "non-zero number of levels/labels", + MultiIndex, levels=[], labels=[]) + + def test_constructor_mismatched_label_levels(self): + levels = [np.array([1]), np.array([2]), np.array([3])] + labels = ["a"] + assertRaisesRegexp(ValueError, "Length of levels and labels must be" + " the same", MultiIndex, levels=levels, + labels=labels) def test_copy(self): i_copy = self.index.copy() @@ -1017,15 +1038,16 @@ def test_view(self): def test_duplicate_names(self): self.index.names = ['foo', 'foo'] - self.assertRaises(Exception, self.index._get_level_number, 'foo') + assertRaisesRegexp(KeyError, 'Level foo not found', + self.index._get_level_number, 'foo') def test_get_level_number_integer(self): self.index.names = [1, 0] self.assertEqual(self.index._get_level_number(1), 0) self.assertEqual(self.index._get_level_number(0), 1) - self.assertRaises(Exception, self.index._get_level_number, 2) - - self.assertRaises(Exception, self.index._get_level_number, 'fourth') + self.assertRaises(IndexError, self.index._get_level_number, 2) + assertRaisesRegexp(KeyError, 'Level fourth not found', + self.index._get_level_number, 'fourth') def test_from_arrays(self): arrays = [] @@ -1060,8 +1082,8 @@ def test_get_level_values(self): def test_reorder_levels(self): # this blows up - self.assertRaises(Exception, self.index.reorder_levels, - [2, 1, 0]) + assertRaisesRegexp(IndexError, '^Too many levels', + self.index.reorder_levels, [2, 1, 0]) def test_nlevels(self): self.assertEquals(self.index.nlevels, 2) @@ -1234,6 +1256,22 @@ def test_slice_locs(self): expected = df[6:15].stack() tm.assert_almost_equal(sliced.values, expected.values) + def test_slice_locs_with_type_mismatch(self): + df = tm.makeTimeDataFrame() + stacked = df.stack() + idx = stacked.index + 
assertRaisesRegexp(TypeError, '^Level type mismatch', idx.slice_locs, + (1, 3)) + assertRaisesRegexp(TypeError, '^Level type mismatch', idx.slice_locs, + df.index[5] + timedelta(seconds=30), (5, 2)) + df = tm.makeCustomDataframe(5, 5) + stacked = df.stack() + idx = stacked.index + assertRaisesRegexp(TypeError, '^Level type mismatch', idx.slice_locs, timedelta(seconds=30)) + # TODO: Try creating a UnicodeDecodeError in exception message + assertRaisesRegexp(TypeError, '^Level type mismatch', idx.slice_locs, + df.index[1], (16, "a")) + def test_slice_locs_not_sorted(self): index = MultiIndex(levels=[Index(lrange(4)), Index(lrange(4)), @@ -1242,8 +1280,9 @@ def test_slice_locs_not_sorted(self): np.array([0, 1, 0, 0, 0, 1, 0, 1]), np.array([1, 0, 1, 1, 0, 0, 1, 0])]) - self.assertRaises(Exception, index.slice_locs, (1, 0, 1), - (2, 1, 0)) + assertRaisesRegexp(KeyError, "[Kk]ey length.*greater than MultiIndex" + " lexsort depth", index.slice_locs, (1, 0, 1), + (2, 1, 0)) # works sorted_index, _ = index.sortlevel(0) @@ -1369,6 +1408,12 @@ def test_get_indexer(self): r1 = idx1.get_indexer([1, 2, 3]) self.assert_((r1 == [-1, -1, -1]).all()) + # create index with duplicates + idx1 = Index(lrange(10) + lrange(10)) + idx2 = Index(range(20)) + assertRaisesRegexp(InvalidIndexError, "Reindexing only valid with" + " uniquely valued Index objects", + idx1.get_indexer, idx2) def test_format(self): self.index.format() @@ -1564,9 +1609,12 @@ def test_diff(self): ('qux', 'one'), ('qux', 'two')]) expected.names = first.names self.assertEqual(first.names, result.names) + assertRaisesRegexp(TypeError, "other must be a MultiIndex or a list" + " of tuples", first.diff, [1,2,3,4,5]) def test_from_tuples(self): - self.assertRaises(Exception, MultiIndex.from_tuples, []) + assertRaisesRegexp(TypeError, 'Cannot infer number of levels from' + ' empty list', MultiIndex.from_tuples, []) idx = MultiIndex.from_tuples(((1, 2), (3, 4)), names=['a', 'b']) self.assertEquals(len(idx), 2) @@ -1638,8 
+1686,8 @@ def test_drop(self): self.assert_(dropped.equals(expected)) index = MultiIndex.from_tuples([('bar', 'two')]) - self.assertRaises(Exception, self.index.drop, [('bar', 'two')]) - self.assertRaises(Exception, self.index.drop, index) + self.assertRaises(KeyError, self.index.drop, [('bar', 'two')]) + self.assertRaises(KeyError, self.index.drop, index) # mixed partial / full drop dropped = self.index.drop(['foo', ('qux', 'one')]) @@ -1693,7 +1741,8 @@ def test_insert(self): self.assert_(new_index[0] == ('abc', 'three')) # key wrong length - self.assertRaises(Exception, self.index.insert, 0, ('foo2',)) + assertRaisesRegexp(ValueError, "Item must have length equal to number" + " of levels", self.index.insert, 0, ('foo2',)) def test_take_preserve_name(self): taken = self.index.take([3, 0, 1]) @@ -1740,7 +1789,8 @@ def _check_all(other): result = idx.join(self.index, level='second') tm.assert_isinstance(result, MultiIndex) - self.assertRaises(Exception, self.index.join, self.index, level=1) + assertRaisesRegexp(TypeError, "Join.*MultiIndex.*ambiguous", + self.index.join, self.index, level=1) def test_join_self(self): kinds = 'outer', 'inner', 'left', 'right' @@ -1774,11 +1824,12 @@ def test_reindex_level(self): exp_indexer2 = np.array([0, -1, 0, -1, 0, -1]) self.assert_(np.array_equal(indexer2, exp_indexer2)) - self.assertRaises(ValueError, self.index.reindex, - self.index, method='pad', level='second') + assertRaisesRegexp(TypeError, "Fill method not supported", + self.index.reindex, self.index, method='pad', + level='second') - self.assertRaises(ValueError, idx.reindex, - idx, method='bfill', level='first') + assertRaisesRegexp(TypeError, "Fill method not supported", + idx.reindex, idx, method='bfill', level='first') def test_has_duplicates(self): self.assert_(not self.index.has_duplicates) diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 3d2a5f2e58ded..e8b50e21f694a 100644 --- a/pandas/tests/test_panel.py +++ 
b/pandas/tests/test_panel.py @@ -1255,7 +1255,8 @@ def test_to_frame_mixed(self): lp = panel.to_frame() wp = lp.to_panel() self.assertEqual(wp['bool'].values.dtype, np.bool_) - assert_frame_equal(wp['bool'], panel['bool']) + # Previously, this was mutating the underlying index and changing its name + assert_frame_equal(wp['bool'], panel['bool'], check_names=False) def test_to_panel_na_handling(self): df = DataFrame(np.random.randint(0, 10, size=20).reshape((10, 2)), diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 82fdf45265e78..8af88895a8b73 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -176,7 +176,9 @@ def assert_almost_equal(a, b, check_less_precise=False): np.testing.assert_(isiterable(b)) na, nb = len(a), len(b) assert na == nb, "%s != %s" % (na, nb) - + # TODO: Figure out why I thought this needed instance checks... + # if (isinstance(a, np.ndarray) and isinstance(b, np.ndarray) and + # np.array_equal(a, b)): if np.array_equal(a, b): return True else: @@ -321,6 +323,18 @@ def assert_contains_all(iterable, dic): for k in iterable: assert k in dic, "Did not contain item: '%r'" % k +def assert_copy(iter1, iter2, **eql_kwargs): + """ + iter1, iter2: iterables that produce elements comparable with assert_almost_equal + + Checks that the elements are equal, but not the same object. (Does not + check that items in sequences are also not the same object) + """ + for elem1, elem2 in zip(iter1, iter2): + assert_almost_equal(elem1, elem2, **eql_kwargs) + assert elem1 is not elem2, "Expected object %r and object %r to be different objects, were same." 
% ( + type(elem1), type(elem2)) + def getCols(k): return string.ascii_uppercase[:k] From 285622f74019ea97240747acf69b9df459b9414e Mon Sep 17 00:00:00 2001 From: Jeffrey Tratner Date: Sat, 29 Jun 2013 10:17:27 -0400 Subject: [PATCH 2/4] ENH: Index inherits from FrozenNDArray + add FrozenList * `FrozenNDArray` - thin wrapper around ndarray that disallows setting methods (will be used for levels on `MultiIndex`) * `FrozenList` - thin wrapper around list that disallows setting methods (needed because of type checks elsewhere) Index inherits from FrozenNDArray now and also actually copies for deepcopy. Assumption is that underlying array is still immutable-ish --- pandas/core/base.py | 88 +++++++++++++++++++++++++- pandas/core/common.py | 4 +- pandas/core/index.py | 9 +-- pandas/io/tests/test_parsers.py | 14 +++-- pandas/tests/test_base.py | 108 ++++++++++++++++++++++++++++++++ pandas/tests/test_common.py | 10 +-- pandas/tests/test_index.py | 3 +- 7 files changed, 215 insertions(+), 21 deletions(-) create mode 100644 pandas/tests/test_base.py diff --git a/pandas/core/base.py b/pandas/core/base.py index 16fe28a804b6b..e635844248371 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1,7 +1,8 @@ """ -Base class(es) for all pandas objects. +Base and utility classes for pandas objects. """ from pandas import compat +import numpy as np class StringMixin(object): """implements string methods so long as object defines a `__unicode__` method. @@ -56,3 +57,88 @@ def __unicode__(self): """ # Should be overwritten by base classes return object.__repr__(self) + +class FrozenList(PandasObject, list): + """ + Container that doesn't allow setting item *but* + because it's technically non-hashable, will be used + for lookups, appropriately, etc. 
+ """ + # Sidenote: This has to be of type list, otherwise it messes up PyTables typechecks + + def __add__(self, other): + if isinstance(other, tuple): + other = list(other) + return self.__class__(super(FrozenList, self).__add__(other)) + + __iadd__ = __add__ + + # Python 2 compat + def __getslice__(self, i, j): + return self.__class__(super(FrozenList, self).__getslice__(i, j)) + + def __getitem__(self, n): + # Python 3 compat + if isinstance(n, slice): + return self.__class__(super(FrozenList, self).__getitem__(n)) + return super(FrozenList, self).__getitem__(n) + + def __radd__(self, other): + if isinstance(other, tuple): + other = list(other) + return self.__class__(other + list(self)) + + def __eq__(self, other): + if isinstance(other, (tuple, FrozenList)): + other = list(other) + return super(FrozenList, self).__eq__(other) + + __req__ = __eq__ + + def __mul__(self, other): + return self.__class__(super(FrozenList, self).__mul__(other)) + + __imul__ = __mul__ + + def __hash__(self): + return hash(tuple(self)) + + def _disabled(self, *args, **kwargs): + """This method will not function because object is immutable.""" + raise TypeError("'%s' does not support mutable operations." 
% + self.__class__) + + def __unicode__(self): + from pandas.core.common import pprint_thing + return "%s(%s)" % (self.__class__.__name__, + pprint_thing(self, quote_strings=True, + escape_chars=('\t', '\r', '\n'))) + + __setitem__ = __setslice__ = __delitem__ = __delslice__ = _disabled + pop = append = extend = remove = sort = insert = _disabled + + +class FrozenNDArray(PandasObject, np.ndarray): + + # no __array_finalize__ for now because no metadata + def __new__(cls, data, dtype=None, copy=False): + if copy is None: + copy = not isinstance(data, FrozenNDArray) + res = np.array(data, dtype=dtype, copy=copy).view(cls) + return res + + def _disabled(self, *args, **kwargs): + """This method will not function because object is immutable.""" + raise TypeError("'%s' does not support mutable operations." % + self.__class__) + + __setitem__ = __setslice__ = __delitem__ = __delslice__ = _disabled + put = itemset = fill = _disabled + + def _shallow_copy(self): + return self.view() + + def values(self): + """returns *copy* of underlying array""" + arr = self.view(np.ndarray).copy() + return arr diff --git a/pandas/core/common.py b/pandas/core/common.py index 06ca3be455f2a..9a90c66902376 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -8,18 +8,16 @@ from numpy.lib.format import read_array, write_array import numpy as np - import pandas.algos as algos import pandas.lib as lib import pandas.tslib as tslib from pandas import compat from pandas.compat import StringIO, BytesIO, range, long, u, zip, map - - from pandas.core.config import get_option from pandas.core import array as pa + # XXX: HACK for NumPy 1.5.1 to suppress warnings try: np.seterr(all='ignore') diff --git a/pandas/core/index.py b/pandas/core/index.py index 15e9398bd0180..3e24788ec411d 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -9,7 +9,7 @@ import pandas.algos as _algos import pandas.index as _index from pandas.lib import Timestamp -from pandas.core.base import PandasObject 
+from pandas.core.base import FrozenList, FrozenNDArray from pandas.util.decorators import cache_readonly from pandas.core.common import isnull @@ -47,7 +47,7 @@ def _shouldbe_timestamp(obj): or tslib.is_timestamp_array(obj)) -class Index(PandasObject, np.ndarray): +class Index(FrozenNDArray): """ Immutable ndarray implementing an ordered, sliceable set. The basic object storing axis labels for all pandas objects @@ -313,7 +313,7 @@ def __deepcopy__(self, memo={}): """ Index is not mutable, so disabling deepcopy """ - return self + return self._shallow_copy() def __contains__(self, key): hash(key) @@ -326,9 +326,6 @@ def __contains__(self, key): def __hash__(self): return hash(self.view(np.ndarray)) - def __setitem__(self, key, value): - raise TypeError(str(self.__class__) + ' does not support item assignment') - def __getitem__(self, key): """Override numpy.ndarray's __getitem__ method to work as desired""" arr_idx = self.view(np.ndarray) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index d83fbd97b6044..d9a4196a6efb3 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -817,7 +817,11 @@ def test_parse_dates_column_list(self): expected = self.read_csv(StringIO(data), sep=";", index_col=lrange(4)) lev = expected.index.levels[0] - expected.index.levels[0] = lev.to_datetime(dayfirst=True) + levels = list(expected.index.levels) + levels[0] = lev.to_datetime(dayfirst=True) + # hack to get this to work - remove for final test + levels[0].name = lev.name + expected.index.levels = levels expected['aux_date'] = to_datetime(expected['aux_date'], dayfirst=True) expected['aux_date'] = lmap(Timestamp, expected['aux_date']) @@ -2144,14 +2148,14 @@ def test_usecols_dtypes(self): 4,5,6 7,8,9 10,11,12""" - result = self.read_csv(StringIO(data), usecols=(0, 1, 2), - names=('a', 'b', 'c'), + result = self.read_csv(StringIO(data), usecols=(0, 1, 2), + names=('a', 'b', 'c'), header=None, converters={'a': str}, 
dtype={'b': int, 'c': float}, - ) + ) result2 = self.read_csv(StringIO(data), usecols=(0, 2), - names=('a', 'b', 'c'), + names=('a', 'b', 'c'), header=None, converters={'a': str}, dtype={'b': int, 'c': float}, diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py new file mode 100644 index 0000000000000..c6285bc95b855 --- /dev/null +++ b/pandas/tests/test_base.py @@ -0,0 +1,108 @@ +import re +import unittest +import numpy as np +from pandas.core.base import FrozenList, FrozenNDArray +from pandas.util.testing import assertRaisesRegexp, assert_isinstance + + +class CheckImmutable(object): + mutable_regex = re.compile('does not support mutable operations') + + def check_mutable_error(self, *args, **kwargs): + # pass whatever functions you normally would to assertRaises (after the Exception kind) + assertRaisesRegexp(TypeError, self.mutable_regex, *args, **kwargs) + + def test_no_mutable_funcs(self): + def setitem(): self.container[0] = 5 + + self.check_mutable_error(setitem) + + def setslice(): self.container[1:2] = 3 + + self.check_mutable_error(setslice) + + def delitem(): del self.container[0] + + self.check_mutable_error(delitem) + + def delslice(): del self.container[0:3] + + self.check_mutable_error(delslice) + mutable_methods = getattr(self, "mutable_methods", []) + for meth in mutable_methods: + self.check_mutable_error(getattr(self.container, meth)) + + def test_slicing_maintains_type(self): + result = self.container[1:2] + expected = self.lst[1:2] + self.check_result(result, expected) + + def check_result(self, result, expected, klass=None): + klass = klass or self.klass + assert_isinstance(result, klass) + self.assertEqual(result, expected) + + +class TestFrozenList(CheckImmutable, unittest.TestCase): + mutable_methods = ('extend', 'pop', 'remove', 'insert') + + def setUp(self): + self.lst = [1, 2, 3, 4, 5] + self.container = FrozenList(self.lst) + self.klass = FrozenList + + def test_add(self): + result = self.container + (1, 2, 3) + expected 
= FrozenList(self.lst + [1, 2, 3]) + self.check_result(result, expected) + + result = (1, 2, 3) + self.container + expected = FrozenList([1, 2, 3] + self.lst) + self.check_result(result, expected) + + def test_inplace(self): + q = r = self.container + q += [5] + self.check_result(q, self.lst + [5]) + # other shouldn't be mutated + self.check_result(r, self.lst) + + +class TestFrozenNDArray(CheckImmutable, unittest.TestCase): + mutable_methods = ('put', 'itemset', 'fill') + + def setUp(self): + self.lst = [3, 5, 7, -2] + self.container = FrozenNDArray(self.lst) + self.klass = FrozenNDArray + + def test_shallow_copying(self): + original = self.container.copy() + assert_isinstance(self.container.view(), FrozenNDArray) + self.assert_(not isinstance(self.container.view(np.ndarray), FrozenNDArray)) + self.assert_(self.container.view() is not self.container) + self.assert_(np.array_equal(self.container, original)) + # shallow copy should be the same too + assert_isinstance(self.container._shallow_copy(), FrozenNDArray) + # setting should not be allowed + def testit(container): container[0] = 16 + + self.check_mutable_error(testit, self.container) + + def test_values(self): + original = self.container.view(np.ndarray).copy() + n = original[0] + 15 + vals = self.container.values() + self.assert_(np.array_equal(original, vals)) + self.assert_(original is not vals) + vals[0] = n + self.assert_(np.array_equal(self.container, original)) + self.assertEqual(vals[0], n) + + +if __name__ == '__main__': + import nose + + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + # '--with-coverage', '--cover-package=pandas.core'], + exit=False) diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index abed2818cb864..3660b31b912fc 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -1,22 +1,21 @@ from datetime import datetime import re +import unittest import nose from nose.tools import assert_equal import unittest +import 
numpy as np +from pandas.tslib import iNaT from pandas import Series, DataFrame, date_range, DatetimeIndex, Timestamp from pandas.compat import range, long, lrange, lmap, u from pandas.core.common import notnull, isnull +import pandas.compat as compat import pandas.core.common as com import pandas.util.testing as tm import pandas.core.config as cf -import numpy as np - -from pandas.tslib import iNaT -from pandas import compat - _multiprocess_can_split_ = True @@ -782,6 +781,7 @@ def test_2d_datetime64(self): expected[:, [2, 4]] = datetime(2007, 1, 1) tm.assert_almost_equal(result, expected) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 5942f8cd9626b..7f889f464916e 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -57,7 +57,8 @@ def test_deepcopy(self): from copy import deepcopy copy = deepcopy(self.strIndex) - self.assert_(copy is self.strIndex) + self.assert_(copy is not self.strIndex) + self.assert_(copy.equals(self.strIndex)) def test_duplicates(self): idx = Index([0, 0, 0]) From 033a9329f2a35d842b951410bdf411eb881d6df5 Mon Sep 17 00:00:00 2001 From: Jeffrey Tratner Date: Fri, 28 Jun 2013 18:07:12 -0400 Subject: [PATCH 3/4] BUG/ENH: Make names, levels and labels properties. * `names` is now a property *and* is set up as an immutable tuple. * `levels` are always (shallow) copied now and it is deprecated to set directly * `labels` are set up as a property now, moving all the processing of labels out of `__new__` + shallow-copied. * `levels` and `labels` are immutable. * Add names tests, motivating example from #3742, reflect tuple-ish output from names, and level names check to reindex test. * Add set_levels, set_labels, set_names and rename to index * Deprecate setting labels and levels directly Similar to other set_* methods...allows mutation if necessary but otherwise returns same object. 
Labels are now converted to `FrozenNDArray` and wrapped in a `FrozenList`. Should mostly resolve #3714 because you have to work to actually make assignments to an `Index`. BUG: Give MultiIndex its own astype method Fixes issue with set_value forgetting names. --- doc/source/indexing.rst | 144 ++++++++++-------- doc/source/release.rst | 23 +++ doc/source/v0.13.0.txt | 18 +++ pandas/core/frame.py | 2 +- pandas/core/generic.py | 2 +- pandas/core/index.py | 250 ++++++++++++++++++++++++++----- pandas/core/reshape.py | 8 +- pandas/io/parsers.py | 2 +- pandas/io/tests/test_excel.py | 2 +- pandas/io/tests/test_parsers.py | 4 +- pandas/io/tests/test_pytables.py | 2 +- pandas/tests/test_frame.py | 4 +- pandas/tests/test_groupby.py | 12 +- pandas/tests/test_index.py | 161 ++++++++++++++------ pandas/tests/test_multilevel.py | 10 +- pandas/tests/test_panel.py | 2 +- pandas/tests/test_panel4d.py | 2 +- pandas/tools/pivot.py | 2 +- pandas/tools/tests/test_merge.py | 12 +- pandas/tools/tests/test_pivot.py | 4 +- pandas/util/decorators.py | 4 +- 21 files changed, 493 insertions(+), 177 deletions(-) diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index 29a7f84cd1452..0cfc954e38f98 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -868,66 +868,6 @@ convert to an integer index: df_new[(df_new['index'] >= 1.0) & (df_new['index'] < 2)] -.. _indexing.class: - -Index objects -------------- - -The pandas Index class and its subclasses can be viewed as implementing an -*ordered set* in addition to providing the support infrastructure necessary for -lookups, data alignment, and reindexing. The easiest way to create one directly -is to pass a list or other sequence to ``Index``: - -.. ipython:: python - - index = Index(['e', 'd', 'a', 'b']) - index - 'd' in index - -You can also pass a ``name`` to be stored in the index: - - -.. 
ipython:: python - - index = Index(['e', 'd', 'a', 'b'], name='something') - index.name - -Starting with pandas 0.5, the name, if set, will be shown in the console -display: - -.. ipython:: python - - index = Index(list(range(5)), name='rows') - columns = Index(['A', 'B', 'C'], name='cols') - df = DataFrame(np.random.randn(5, 3), index=index, columns=columns) - df - df['A'] - - -Set operations on Index objects -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. _indexing.set_ops: - -The three main operations are ``union (|)``, ``intersection (&)``, and ``diff -(-)``. These can be directly called as instance methods or used via overloaded -operators: - -.. ipython:: python - - a = Index(['c', 'b', 'a']) - b = Index(['c', 'e', 'd']) - a.union(b) - a | b - a & b - a - b - -``isin`` method of Index objects -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -One additional operation is the ``isin`` method that works analogously to the -``Series.isin`` method found :ref:`here `. - .. _indexing.hierarchical: Hierarchical indexing (MultiIndex) @@ -1189,7 +1129,7 @@ are named. .. ipython:: python - s.index.names = ['L1', 'L2'] + s.index.set_names(['L1', 'L2'], inplace=True) s.sortlevel(level='L1') s.sortlevel(level='L2') @@ -1276,6 +1216,88 @@ not check (or care) whether the levels themselves are sorted. Fortunately, the constructors ``from_tuples`` and ``from_arrays`` ensure that this is true, but if you compute the levels and labels yourself, please be careful. +.. _indexing.class: + +Index objects +------------- + +The pandas Index class and its subclasses can be viewed as implementing an +*ordered set* in addition to providing the support infrastructure necessary for +lookups, data alignment, and reindexing. The easiest way to create one directly +is to pass a list or other sequence to ``Index``: + +.. ipython:: python + + index = Index(['e', 'd', 'a', 'b']) + index + 'd' in index + +You can also pass a ``name`` to be stored in the index: + + +.. 
ipython:: python + + index = Index(['e', 'd', 'a', 'b'], name='something') + index.name + +Starting with pandas 0.5, the name, if set, will be shown in the console +display: + +.. ipython:: python + + index = Index(list(range(5)), name='rows') + columns = Index(['A', 'B', 'C'], name='cols') + df = DataFrame(np.random.randn(5, 3), index=index, columns=columns) + df + df['A'] + + +Set operations on Index objects +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. _indexing.set_ops: + +The three main operations are ``union (|)``, ``intersection (&)``, and ``diff +(-)``. These can be directly called as instance methods or used via overloaded +operators: + +.. ipython:: python + + a = Index(['c', 'b', 'a']) + b = Index(['c', 'e', 'd']) + a.union(b) + a | b + a & b + a - b + +``isin`` method of Index objects +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +One additional operation is the ``isin`` method that works analogously to the +``Series.isin`` method found :ref:`here `. + +Setting index metadata (``name(s)``, ``levels``, ``labels``) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. _indexing.set_metadata: + +Indexes are "mostly immutable", but it is possible to set and change their +metadata, like the index ``name`` (or, for ``MultiIndex``, ``levels`` and +``labels``). + +You can use the ``rename``, ``set_names``, ``set_levels``, and ``set_labels`` +to set these attributes directly. They default to returning a copy; however, +you can specify ``inplace=True`` to have the data change inplace. + +.. 
ipython:: python + + ind = Index([1, 2, 3]) + ind.rename("apple") + ind + ind.set_names(["apple"], inplace=True) + ind.name = "bob" + ind + Adding an index to an existing DataFrame ---------------------------------------- diff --git a/doc/source/release.rst b/doc/source/release.rst index 769b47b18db08..58dcce9b9e290 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -47,6 +47,9 @@ pandas 0.13 - Added a more informative error message when plot arguments contain overlapping color and style arguments (:issue:`4402`) - Significant table writing performance improvements in ``HDFStore`` + - Add ``rename`` and ``set_names`` methods to ``Index`` as well as + ``set_names``, ``set_levels``, ``set_labels`` to ``MultiIndex``. + (:issue:`4039`) **API Changes** @@ -66,6 +69,7 @@ pandas 0.13 an alias of iteritems used to get around ``2to3``'s changes). (:issue:`4384`, :issue:`4375`, :issue:`4372`) - ``Series.get`` with negative indexers now returns the same as ``[]`` (:issue:`4390`) + - ``HDFStore`` - added an ``is_open`` property to indicate if the underlying file handle is_open; @@ -83,6 +87,21 @@ pandas 0.13 be raised if you try to use ``mode='w'`` with an OPEN file handle (:issue:`4367`) - allow a passed locations array or mask as a ``where`` condition (:issue:`4467`) + - ``Index`` and ``MultiIndex`` changes (:issue:`4039`): + + - Setting ``levels`` and ``labels`` directly on ``MultiIndex`` is now + deprecated. Instead, you can use the ``set_levels()`` and + ``set_labels()`` methods. + - ``levels``, ``labels`` and ``names`` properties no longer return lists, + but instead return containers that do not allow setting of items + ('mostly immutable') + - ``levels``, ``labels`` and ``names`` are validated upon setting and are + either copied or shallow-copied. + - ``__deepcopy__`` now returns a shallow copy (currently: a view) of the + data - allowing metadata changes. 
+ - ``MultiIndex.astype()`` now only allows ``np.object_``-like dtypes and + now returns a ``MultiIndex`` rather than an ``Index``. (:issue:`4039`) + **Experimental Features** **Bug Fixes** @@ -136,6 +155,10 @@ pandas 0.13 - frozenset objects now raise in the ``Series`` constructor (:issue:`4482`, :issue:`4480`) - Fixed issue with sorting a duplicate multi-index that has multiple dtypes (:issue:`4516`) + - Fixed bug in ``DataFrame.set_value`` which was causing name attributes to + be lost when expanding the index. (:issue:`3742`, :issue:`4039`) + - Fixed issue where individual ``names``, ``levels`` and ``labels`` could be + set on ``MultiIndex`` without validation (:issue:`3714`, :issue:`4039`) pandas 0.12 =========== diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt index 7da2f03ad4c74..05bae7a952612 100644 --- a/doc/source/v0.13.0.txt +++ b/doc/source/v0.13.0.txt @@ -72,6 +72,24 @@ API changes import os os.remove(path) + - Changes to how ``Index`` and ``MultiIndex`` handle metadata (``levels``, + ``labels``, and ``names``) (:issue:`4039`): + + .. code-block:: python + + # previously, you would have set levels or labels directly + index.levels = [[1, 2, 3, 4], [1, 2, 4, 4]] + + # now, you use the set_levels or set_labels methods + index = index.set_levels([[1, 2, 3, 4], [1, 2, 4, 4]]) + + # similarly, for names, you can rename the object + # but setting names is not deprecated. 
+ index = index.set_names(["bob", "cranberry"]) + + # and all methods take an inplace kwarg + index.set_names(["bob", "cranberry"], inplace=True) + Enhancements ~~~~~~~~~~~~ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0f3bcb32f7287..20a2dab06368b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1150,7 +1150,7 @@ def to_records(self, index=True, convert_datetime64=True): arrays = ix_vals+ [self[c].values for c in self.columns] count = 0 - index_names = self.index.names + index_names = list(self.index.names) if isinstance(self.index, MultiIndex): for i, n in enumerate(index_names): if n is None: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0eaae228da627..2ee7f791c671f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -404,7 +404,7 @@ def drop(self, labels, axis=0, level=None): new_axis = axis.drop(labels) dropped = self.reindex(**{axis_name: new_axis}) try: - dropped.axes[axis_].names = axis.names + dropped.axes[axis_].set_names(axis.names, inplace=True) except AttributeError: pass return dropped diff --git a/pandas/core/index.py b/pandas/core/index.py index 3e24788ec411d..9b7960389ad51 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -1,5 +1,5 @@ # pylint: disable=E1101,E1103,W0232 - +from functools import partial from pandas.compat import range, zip, lrange, lzip from pandas import compat import numpy as np @@ -11,10 +11,11 @@ from pandas.lib import Timestamp from pandas.core.base import FrozenList, FrozenNDArray -from pandas.util.decorators import cache_readonly +from pandas.util.decorators import cache_readonly, deprecate from pandas.core.common import isnull import pandas.core.common as com from pandas.core.config import get_option +import warnings __all__ = ['Index'] @@ -38,6 +39,7 @@ def wrapper(self, other): class InvalidIndexError(Exception): pass + _o_dtype = np.dtype(object) @@ -129,7 +131,8 @@ def __new__(cls, data, dtype=None, copy=False, name=None, **kwargs): 
return PeriodIndex(subarr, name=name, **kwargs) subarr = subarr.view(cls) - subarr.name = name + # could also have a _set_name, but I don't think it's really necessary + subarr._set_names([name]) return subarr def __array_finalize__(self, obj): @@ -197,7 +200,7 @@ def nlevels(self): # for compat with multindex code def _get_names(self): - return [self.name] + return FrozenList((self.name,)) def _set_names(self, values): if len(values) != 1: @@ -207,6 +210,31 @@ def _set_names(self, values): names = property(fset=_set_names, fget=_get_names) + def set_names(self, names, inplace=False): + """ + Set new names on index. Defaults to returning new index. + + Parameters + ---------- + names : sequence + names to set + inplace : bool + if True, mutates in place + + Returns + ------- + new index (of same type and class...etc) + """ + if inplace: + idx = self + else: + idx = self._shallow_copy() + idx._set_names(names) + return idx + + def rename(self, name, inplace=False): + return self.set_names([name], inplace=inplace) + @property def _has_complex_internals(self): # to disable groupby tricks in MultiIndex @@ -761,7 +789,8 @@ def get_level_values(self, level): ------- values : ndarray """ - num = self._get_level_number(level) + # checks that level number is actually just 1 + self._get_level_number(level) return self def get_indexer(self, target, method=None, limit=None): @@ -804,8 +833,8 @@ def get_indexer(self, target, method=None, limit=None): return this.get_indexer(target, method=method, limit=limit) if not self.is_unique: - raise InvalidIndexError('Reindexing only valid with uniquely valued Index ' - 'objects') + raise InvalidIndexError('Reindexing only valid with uniquely' + ' valued Index objects') if method == 'pad': if not self.is_monotonic: @@ -1421,8 +1450,10 @@ class MultiIndex(Index): names : optional sequence of objects Names for each of the index levels. 
""" - # shadow property - names = None + # initialize to zero-length tuples to make everything work + _names = FrozenList() + _levels = FrozenList() + _labels = FrozenList() def __new__(cls, levels=None, labels=None, sortorder=None, names=None): if len(levels) != len(labels): @@ -1438,26 +1469,14 @@ def __new__(cls, levels=None, labels=None, sortorder=None, names=None): return Index(levels[0], name=name).take(labels[0]) - levels = [_ensure_index(lev) for lev in levels] - labels = [np.asarray(labs, dtype=np.int_) for labs in labels] - # v3, 0.8.0 subarr = np.empty(0, dtype=object).view(cls) - subarr.levels = levels - subarr.labels = labels + subarr._set_levels(levels) + subarr._set_labels(labels) - if names is None: - subarr.names = [None] * subarr.nlevels - else: - if len(names) != subarr.nlevels: - raise AssertionError(('Length of names (%d) must be same as level ' - '(%d)') % (len(names),subarr.nlevels)) + if names is not None: + subarr._set_names(names) - subarr.names = list(names) - - # set the name - for i, name in enumerate(subarr.names): - subarr.levels[i].name = name if sortorder is not None: subarr.sortorder = int(sortorder) @@ -1466,6 +1485,123 @@ def __new__(cls, levels=None, labels=None, sortorder=None, names=None): return subarr + def _get_levels(self): + return self._levels + + + def _set_levels(self, levels): + # This is NOT part of the levels property because it should be + # externally not allowed to set levels. User beware if you change + # _levels directly + if len(levels) == 0: + raise ValueError("Must set non-zero number of levels.") + levels = FrozenList(_ensure_index(lev)._shallow_copy() + for lev in levels) + names = self.names + self._levels = levels + if len(names): + self._set_names(names) + + def set_levels(self, levels, inplace=False): + """ + Set new levels on MultiIndex. Defaults to returning + new index. 
+ + Parameters + ---------- + levels : sequence + new levels to apply + inplace : bool + if True, mutates in place + + Returns + ------- + new index (of same type and class...etc) + """ + if inplace: + idx = self + else: + idx = self._shallow_copy() + idx._set_levels(levels) + return idx + + # remove me in 0.14 and change to read only property + __set_levels = deprecate("setting `levels` directly", + partial(set_levels, inplace=True), + alt_name="set_levels") + levels = property(fget=_get_levels, fset=__set_levels) + + def _get_labels(self): + return self._labels + + def _set_labels(self, labels): + if len(labels) != self.nlevels: + raise ValueError("Length of levels and labels must be the same.") + self._labels = FrozenList(_ensure_frozen(labs)._shallow_copy() for labs in labels) + + def set_labels(self, labels, inplace=False): + """ + Set new labels on MultiIndex. Defaults to returning + new index. + + Parameters + ---------- + labels : sequence + new levels to apply + inplace : bool + if True, mutates in place + + Returns + ------- + new index (of same type and class...etc) + """ + if inplace: + idx = self + else: + idx = self._shallow_copy() + idx._set_labels(labels) + return idx + + # remove me in 0.14 and change to readonly property + __set_labels = deprecate("setting labels directly", + partial(set_labels, inplace=True), + alt_name="set_labels") + labels = property(fget=_get_labels, fset=__set_labels) + + def copy(self, names=None, dtype=None, levels=None, labels=None, + deep=False): + """ + Make a copy of this object. Names, dtype, levels and labels can be + passed and will be set on new copy. + + Parameters + ---------- + names : sequence, optional + dtype : numpy dtype or pandas type, optional + levels : sequence, optional + labels : sequence, optional + + Returns + ------- + copy : MultiIndex + + Notes + ----- + In most cases, there should be no functional difference from using + ``deep``, but if ``deep`` is passed it will attempt to deepcopy. 
+ This could be potentially expensive on large MultiIndex objects. + """ + new_index = np.ndarray.copy(self) + if levels is not None: + new_index = new_index.set_levels(levels) + if labels is not None: + new_index = new_index.set_labels(labels) + if names is not None: + new_index = new_index.set_names(names) + if dtype: + new_index = new_index.astype(dtype) + return new_index + def __array_finalize__(self, obj): """ Update custom MultiIndex attributes when a new array is created by @@ -1476,9 +1612,9 @@ def __array_finalize__(self, obj): # instance. return - self.levels = list(getattr(obj, 'levels', [])) - self.labels = list(getattr(obj, 'labels', [])) - self.names = list(getattr(obj, 'names', [])) + self._set_levels(getattr(obj, 'levels', [])) + self._set_labels(getattr(obj, 'labels', [])) + self._set_names(getattr(obj, 'names', [])) self.sortorder = getattr(obj, 'sortorder', None) def _array_values(self): @@ -1505,6 +1641,26 @@ def __unicode__(self): def __len__(self): return len(self.labels[0]) + def _get_names(self): + return FrozenList(level.name for level in self.levels) + + def _set_names(self, values): + """ + sets names on levels. WARNING: mutates! 
+ + Note that you generally want to set this *after* changing levels, so that it only + acts on copies""" + values = list(values) + if len(values) != self.nlevels: + raise ValueError('Length of names (%d) must be same as level ' + '(%d)' % (len(values),self.nlevels)) + # set the name + for name, level in zip(values, self.levels): + level.rename(name, inplace=True) + + + names = property(fset=_set_names, fget=_get_names, doc="Names of levels in MultiIndex") + def _format_native_types(self, **kwargs): return self.tolist() @@ -1520,9 +1676,9 @@ def inferred_type(self): def _from_elements(values, labels=None, levels=None, names=None, sortorder=None): index = values.view(MultiIndex) - index.levels = levels - index.labels = labels - index.names = names + index._set_levels(levels) + index._set_labels(labels) + index._set_names(names) index.sortorder = sortorder return index @@ -1832,9 +1988,9 @@ def __setstate__(self, state): np.ndarray.__setstate__(self, nd_state) levels, labels, sortorder, names = own_state - self.levels = [Index(x) for x in levels] - self.labels = labels - self.names = names + self._set_levels([Index(x) for x in levels]) + self._set_labels(labels) + self._set_names(names) self.sortorder = sortorder def __getitem__(self, key): @@ -1859,10 +2015,10 @@ def __getitem__(self, key): new_labels = [lab[key] for lab in self.labels] # an optimization - result.levels = list(self.levels) - result.labels = new_labels + result._set_levels(self.levels) + result._set_labels(new_labels) result.sortorder = sortorder - result.names = self.names + result._set_names(self.names) return result @@ -2155,7 +2311,6 @@ def reindex(self, target, method=None, level=None, limit=None, """ if level is not None: if method is not None: - # FIXME: Should this actually be a TypeError [given that it's a signature issue] or ValueError raise TypeError('Fill method not supported if level passed') target, indexer, _ = self._join_level(target, level, how='right', return_indexers=True) @@ 
-2545,7 +2700,8 @@ def diff(self, other): try: other = MultiIndex.from_tuples(other) except: - raise TypeError("other must be a MultiIndex or a list of tuples") + raise TypeError('other must be a MultiIndex or a list of' + ' tuples') result_names = self.names else: result_names = self.names if self.names == other.names else None @@ -2568,6 +2724,11 @@ def diff(self, other): def _assert_can_do_setop(self, other): pass + def astype(self, dtype): + if np.dtype(dtype) != np.object_: + raise TypeError("Setting %s dtype to anything other than object is not supported" % self.__class__) + return self._shallow_copy() + def insert(self, loc, item): """ Make new MultiIndex inserting new item at location @@ -2676,7 +2837,11 @@ def _ensure_index(index_like): if hasattr(index_like, 'name'): return Index(index_like, name=index_like.name) + # must check for exactly list here because of strict type + # check in clean_index_list if isinstance(index_like, list): + if type(index_like) != list: + index_like = list(index_like) # #2200 ? 
converted, all_arrays = lib.clean_index_list(index_like) @@ -2687,6 +2852,13 @@ def _ensure_index(index_like): return Index(index_like) +def _ensure_frozen(nd_array_like): + if isinstance(nd_array_like, FrozenNDArray): + return nd_array_like + else: + arr = np.asarray(nd_array_like, dtype=np.int_) + return arr.view(FrozenNDArray) + def _validate_join_method(method): if method not in ['left', 'right', 'inner', 'outer']: diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index b69e4a6a96acc..4596b93d79778 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -92,8 +92,8 @@ def _make_index(lev,lab): def _make_sorted_values_labels(self): v = self.level - labs = self.index.labels - levs = self.index.levels + labs = list(self.index.labels) + levs = list(self.index.levels) to_sort = labs[:v] + labs[v + 1:] + [labs[v]] sizes = [len(x) for x in levs[:v] + levs[v + 1:] + [levs[v]]] @@ -206,8 +206,8 @@ def get_new_columns(self): width = len(self.value_columns) propagator = np.repeat(np.arange(width), stride) if isinstance(self.value_columns, MultiIndex): - new_levels = self.value_columns.levels + [self.removed_level] - new_names = self.value_columns.names + [self.removed_name] + new_levels = self.value_columns.levels + (self.removed_level,) + new_names = self.value_columns.names + (self.removed_name,) new_labels = [lab.take(propagator) for lab in self.value_columns.labels] diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index a6c8584441daf..3b132be800cb1 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -774,7 +774,7 @@ def _make_index(self, data, alldata, columns, indexnamerow=False): # add names for the index if indexnamerow: coffset = len(indexnamerow) - len(columns) - index.names = indexnamerow[:coffset] + index = index.set_names(indexnamerow[:coffset]) # maybe create a mi on the columns columns = self._maybe_make_multi_index_columns(columns, self.col_names) diff --git a/pandas/io/tests/test_excel.py 
b/pandas/io/tests/test_excel.py index 1ac4d4e31ed10..3f41be6ae64c6 100644 --- a/pandas/io/tests/test_excel.py +++ b/pandas/io/tests/test_excel.py @@ -675,7 +675,7 @@ def _check_excel_multiindex_dates(self, ext): recons = reader.parse('test1', index_col=[0, 1]) tm.assert_frame_equal(tsframe, recons, check_names=False) - self.assertEquals(recons.index.names, ['time', 'foo']) + self.assertEquals(recons.index.names, ('time', 'foo')) # infer index tsframe.to_excel(path, 'test1') diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index d9a4196a6efb3..41345352b5ec5 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -821,7 +821,7 @@ def test_parse_dates_column_list(self): levels[0] = lev.to_datetime(dayfirst=True) # hack to get this to work - remove for final test levels[0].name = lev.name - expected.index.levels = levels + expected.index.set_levels(levels, inplace=True) expected['aux_date'] = to_datetime(expected['aux_date'], dayfirst=True) expected['aux_date'] = lmap(Timestamp, expected['aux_date']) @@ -1339,7 +1339,7 @@ def test_read_table_buglet_4x_multiindex(self): # it works! 
df = self.read_table(StringIO(text), sep='\s+') - self.assertEquals(df.index.names, ['one', 'two', 'three', 'four']) + self.assertEquals(df.index.names, ('one', 'two', 'three', 'four')) def test_read_csv_parse_simple_list(self): text = """foo diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 9575d99229dc4..3bc32fb3f5a32 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -1921,7 +1921,7 @@ def test_store_hierarchical(self): with ensure_clean(self.path) as store: store['frame'] = frame recons = store['frame'] - assert(recons.index.names == ['foo', 'bar']) + assert(recons.index.names == ('foo', 'bar')) def test_store_index_name(self): df = tm.makeDataFrame() diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 9e49d79de65a3..7043698ea6476 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -3526,7 +3526,7 @@ def create_dict(order_id): # MultiIndex result = DataFrame.from_records(documents, index=['order_id', 'quantity']) - self.assert_(result.index.names == ['order_id', 'quantity']) + self.assert_(result.index.names == ('order_id', 'quantity')) def test_from_records_misc_brokenness(self): # #2179 @@ -7239,7 +7239,7 @@ def test_pivot(self): # don't specify values pivoted = frame.pivot(index='index', columns='columns') self.assertEqual(pivoted.index.name, 'index') - self.assertEqual(pivoted.columns.names, [None, 'columns']) + self.assertEqual(pivoted.columns.names, (None, 'columns')) # pivot multiple columns wp = tm.makePanel() diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 19f15e44dc096..9e7cdf9df2c6b 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -1283,13 +1283,13 @@ def desc3(group): return result result = grouped.apply(desc) - self.assertEquals(result.index.names, ['A', 'B', 'stat']) + self.assertEquals(result.index.names, ('A', 'B', 'stat')) result2 = grouped.apply(desc2) - 
self.assertEquals(result2.index.names, ['A', 'B', 'stat']) + self.assertEquals(result2.index.names, ('A', 'B', 'stat')) result3 = grouped.apply(desc3) - self.assertEquals(result3.index.names, ['A', 'B', None]) + self.assertEquals(result3.index.names, ('A', 'B', None)) def test_nonsense_func(self): df = DataFrame([0]) @@ -1519,7 +1519,7 @@ def f(piece): def test_apply_series_yield_constant(self): result = self.df.groupby(['A', 'B'])['C'].apply(len) - self.assertEquals(result.index.names[:2], ['A', 'B']) + self.assertEquals(result.index.names[:2], ('A', 'B')) def test_apply_frame_to_series(self): grouped = self.df.groupby(['A', 'B']) @@ -1836,7 +1836,7 @@ def test_groupby_series_with_name(self): result = self.df.groupby([self.df['A'], self.df['B']]).mean() result2 = self.df.groupby([self.df['A'], self.df['B']], as_index=False).mean() - self.assertEquals(result.index.names, ['A', 'B']) + self.assertEquals(result.index.names, ('A', 'B')) self.assert_('A' in result2) self.assert_('B' in result2) @@ -2332,7 +2332,7 @@ def test_no_dummy_key_names(self): result = self.df.groupby([self.df['A'].values, self.df['B'].values]).sum() - self.assert_(result.index.names == [None, None]) + self.assert_(result.index.names == (None, None)) def test_groupby_categorical(self): levels = ['foo', 'bar', 'baz', 'qux'] diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 7f889f464916e..b6c32a1e49405 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -12,7 +12,8 @@ from numpy.testing import assert_array_equal from pandas.core.index import Index, Int64Index, MultiIndex, InvalidIndexError -from pandas.util.testing import assert_almost_equal, assertRaisesRegexp +from pandas.util.testing import(assert_almost_equal, assertRaisesRegexp, + assert_copy) from pandas import compat import pandas.util.testing as tm @@ -465,7 +466,7 @@ def test_slice_locs_dup(self): rs = idx.slice_locs('a', 'd') self.assert_(rs == (0, 6)) - rs2 = idx.slice_locs(end='d') + rs 
= idx.slice_locs(end='d') self.assert_(rs == (0, 6)) rs = idx.slice_locs('a', 'c') @@ -496,11 +497,10 @@ def test_tuple_union_bug(self): import pandas import numpy as np - aidx1 = np.array( - [(1, 'A'), (2, 'A'), (1, 'B'), (2, 'B')], dtype=[('num', - int), ('let', 'a1')]) + aidx1 = np.array([(1, 'A'), (2, 'A'), (1, 'B'), (2, 'B')], + dtype=[('num', int), ('let', 'a1')]) aidx2 = np.array([(1, 'A'), (2, 'A'), (1, 'B'), (2, 'B'), (1, 'C'), (2, - 'C')], dtype=[('num', int), ('let', 'a1')]) + 'C')], dtype=[('num', int), ('let', 'a1')]) idx1 = pandas.Index(aidx1) idx2 = pandas.Index(aidx2) @@ -931,7 +931,7 @@ def test_print_unicode_columns(self): repr(df.columns) # should not raise UnicodeDecodeError def test_repr_summary(self): - with cf.option_context('display.max_seq_items',10): + with cf.option_context('display.max_seq_items', 10): r = repr(pd.Index(np.arange(1000))) self.assertTrue(len(r) < 100) self.assertTrue("..." in r) @@ -965,10 +965,52 @@ def setUp(self): major_labels = np.array([0, 0, 1, 2, 3, 3]) minor_labels = np.array([0, 1, 0, 1, 0, 1]) - + self.index_names = ['first', 'second'] self.index = MultiIndex(levels=[major_axis, minor_axis], labels=[major_labels, minor_labels], - names=['first', 'second']) + names=self.index_names) + + def test_names(self): + + # names are assigned in __init__ + names = self.index_names + level_names = [level.name for level in self.index.levels] + self.assertEqual(names, level_names) + + # setting bad names on existing + index = self.index + assertRaisesRegexp(ValueError, "^Length of names", setattr, index, + "names", list(index.names) + ["third"]) + assertRaisesRegexp(ValueError, "^Length of names", setattr, index, + "names", []) + + # initializing with bad names (should always be equivalent) + major_axis, minor_axis = self.index.levels + major_labels, minor_labels = self.index.labels + assertRaisesRegexp(ValueError, "^Length of names", MultiIndex, + levels=[major_axis, minor_axis], + labels=[major_labels, minor_labels], + 
names=['first']) + assertRaisesRegexp(ValueError, "^Length of names", MultiIndex, + levels=[major_axis, minor_axis], + labels=[major_labels, minor_labels], + names=['first', 'second', 'third']) + + # names are assigned + index.names = ["a", "b"] + ind_names = list(index.names) + level_names = [level.name for level in index.levels] + self.assertEqual(ind_names, level_names) + + def test_astype(self): + expected = self.index.copy() + actual = self.index.astype('O') + assert_copy(actual.levels, expected.levels) + assert_copy(actual.labels, expected.labels) + self.check_level_names(actual, expected.names) + + assertRaisesRegexp(TypeError, "^Setting.*dtype.*object", self.index.astype, np.dtype(int)) + def test_constructor_single_level(self): single_level = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']], @@ -993,49 +1035,64 @@ def test_constructor_mismatched_label_levels(self): " the same", MultiIndex, levels=levels, labels=labels) - def test_copy(self): - i_copy = self.index.copy() + def assert_multiindex_copied(self, copy, original): + # levels shoudl be (at least, shallow copied) + assert_copy(copy.levels, original.levels) - # Equal...but not the same object - self.assert_(i_copy.levels == self.index.levels) - self.assert_(i_copy.levels is not self.index.levels) + assert_almost_equal(copy.labels, original.labels) - self.assert_(i_copy.labels == self.index.labels) - self.assert_(i_copy.labels is not self.index.labels) + # labels doesn't matter which way copied + assert_almost_equal(copy.labels, original.labels) + self.assert_(copy.labels is not original.labels) - self.assert_(i_copy.names == self.index.names) - self.assert_(i_copy.names is not self.index.names) + # names doesn't matter which way copied + self.assert_(copy.names == original.names) + self.assert_(copy.names is not original.names) - self.assert_(i_copy.sortorder == self.index.sortorder) + # sort order should be copied + self.assert_(copy.sortorder == original.sortorder) - def test_shallow_copy(self): - 
i_copy = self.index._shallow_copy() + def test_copy(self): + i_copy = self.index.copy() - # Equal...but not the same object - self.assert_(i_copy.levels == self.index.levels) - self.assert_(i_copy.levels is not self.index.levels) + self.assert_multiindex_copied(i_copy, self.index) - self.assert_(i_copy.labels == self.index.labels) - self.assert_(i_copy.labels is not self.index.labels) - self.assert_(i_copy.names == self.index.names) - self.assert_(i_copy.names is not self.index.names) + def test_shallow_copy(self): + i_copy = self.index._shallow_copy() - self.assert_(i_copy.sortorder == self.index.sortorder) + self.assert_multiindex_copied(i_copy, self.index) def test_view(self): i_view = self.index.view() - # Equal...but not the same object - self.assert_(i_view.levels == self.index.levels) - self.assert_(i_view.levels is not self.index.levels) + self.assert_multiindex_copied(i_view, self.index) + + def check_level_names(self, index, names): + self.assertEqual([level.name for level in index.levels], list(names)) + + def test_changing_names(self): + # names should be applied to levels + level_names = [level.name for level in self.index.levels] + self.check_level_names(self.index, self.index.names) - self.assert_(i_view.labels == self.index.labels) - self.assert_(i_view.labels is not self.index.labels) + view = self.index.view() + copy = self.index.copy() + shallow_copy = self.index._shallow_copy() - self.assert_(i_view.names == self.index.names) - self.assert_(i_view.names is not self.index.names) - self.assert_(i_view.sortorder == self.index.sortorder) + # changing names should change level names on object + new_names = [name + "a" for name in self.index.names] + self.index.names = new_names + self.check_level_names(self.index, new_names) + + # but not on copies + self.check_level_names(view, level_names) + self.check_level_names(copy, level_names) + self.check_level_names(shallow_copy, level_names) + + # and copies shouldn't change original + shallow_copy.names = 
[name + "c" for name in shallow_copy.names] + self.check_level_names(self.index, new_names) def test_duplicate_names(self): self.index.names = ['foo', 'foo'] @@ -1287,7 +1344,8 @@ def test_slice_locs_not_sorted(self): # works sorted_index, _ = index.sortlevel(0) - result = sorted_index.slice_locs((1, 0, 1), (2, 1, 0)) + # should there be a test case here??? + sorted_index.slice_locs((1, 0, 1), (2, 1, 0)) def test_slice_locs_partial(self): sorted_idx, _ = self.index.sortlevel(0) @@ -1589,7 +1647,7 @@ def test_diff(self): chunklet = self.index[-3:] chunklet.names = ['foo', 'baz'] result = first - chunklet - self.assertEqual(result.names, [None, None]) + self.assertEqual(result.names, (None, None)) # empty, but non-equal result = self.index - self.index.sortlevel(1)[0] @@ -1606,13 +1664,30 @@ def test_diff(self): # name from non-empty array result = first.diff([('foo', 'one')]) - expected = pd.MultiIndex.from_tuples([('bar', 'one'), ('baz', 'two'), ('foo', 'two'), - ('qux', 'one'), ('qux', 'two')]) + expected = pd.MultiIndex.from_tuples([('bar', 'one'), ('baz', 'two'), + ('foo', 'two'), ('qux', 'one'), + ('qux', 'two')]) expected.names = first.names self.assertEqual(first.names, result.names) assertRaisesRegexp(TypeError, "other must be a MultiIndex or a list" " of tuples", first.diff, [1,2,3,4,5]) + def test_set_value_keeps_names(self): + # motivating example from #3742 + lev1 = ['hans', 'hans', 'hans', 'grethe', 'grethe', 'grethe'] + lev2 = ['1', '2', '3'] * 2 + idx = pd.MultiIndex.from_arrays( + [lev1, lev2], + names=['Name', 'Number']) + df = pd.DataFrame( + np.random.randn(6, 4), + columns=['one', 'two', 'three', 'four'], + index=idx) + df = df.sortlevel() + self.assertEqual(tuple(df.index.names), ('Name', 'Number')) + df = df.set_value(('grethe', 'hans'), 'one', 99.34) + self.assertEqual(tuple(df.index.names), ('Name', 'Number')) + def test_from_tuples(self): assertRaisesRegexp(TypeError, 'Cannot infer number of levels from' ' empty list', 
MultiIndex.from_tuples, []) @@ -1708,7 +1783,7 @@ def test_droplevel_with_names(self): np.array([1, 0, 1, 1, 0, 0, 1, 0])], names=['one', 'two', 'three']) dropped = index.droplevel(0) - self.assertEqual(dropped.names, ['two', 'three']) + self.assertEqual(dropped.names, ('two', 'three')) dropped = index.droplevel('two') expected = index.droplevel(1) @@ -1803,10 +1878,12 @@ def test_join_self(self): def test_reindex(self): result, indexer = self.index.reindex(list(self.index[:4])) tm.assert_isinstance(result, MultiIndex) + self.check_level_names(result, self.index[:4].names) result, indexer = self.index.reindex(list(self.index)) tm.assert_isinstance(result, MultiIndex) self.assert_(indexer is None) + self.check_level_names(result, self.index.names) def test_reindex_level(self): idx = Index(['one']) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 7379bf5d148dc..c903af1860421 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -55,9 +55,11 @@ def setUp(self): lambda x: x.day]).sum() # use Int64Index, to make sure things work - self.ymd.index.levels = [lev.astype('i8') - for lev in self.ymd.index.levels] - self.ymd.index.names = ['year', 'month', 'day'] + self.ymd.index.set_levels([lev.astype('i8') + for lev in self.ymd.index.levels], + inplace=True) + self.ymd.index.set_names(['year', 'month', 'day'], + inplace=True) def test_append(self): a, b = self.frame[:5], self.frame[5:] @@ -1667,7 +1669,7 @@ def test_drop_preserve_names(self): df = DataFrame(np.random.randn(6, 3), index=index) result = df.drop([(0, 2)]) - self.assert_(result.index.names == ['one', 'two']) + self.assert_(result.index.names == ('one', 'two')) def test_unicode_repr_issues(self): levels = [Index([u('a/\u03c3'), u('b/\u03c3'), u('c/\u03c3')]), diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index e8b50e21f694a..c5f9f962f4646 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -1227,7 
+1227,7 @@ def test_to_frame(self): assert_panel_equal(unfiltered.to_panel(), self.panel) # names - self.assertEqual(unfiltered.index.names, ['major', 'minor']) + self.assertEqual(unfiltered.index.names, ('major', 'minor')) # unsorted, round trip df = self.panel.to_frame(filter_observations=False) diff --git a/pandas/tests/test_panel4d.py b/pandas/tests/test_panel4d.py index 3c6ab18126e8f..eddddb42b680e 100644 --- a/pandas/tests/test_panel4d.py +++ b/pandas/tests/test_panel4d.py @@ -898,7 +898,7 @@ def test_to_frame(self): # assert_panel_equal(unfiltered.to_panel(), self.panel) # # names - # self.assertEqual(unfiltered.index.names, ['major', 'minor']) + # self.assertEqual(unfiltered.index.names, ('major', 'minor')) def test_to_frame_mixed(self): raise nose.SkipTest diff --git a/pandas/tools/pivot.py b/pandas/tools/pivot.py index effcc3ff7695f..9f497e50df802 100644 --- a/pandas/tools/pivot.py +++ b/pandas/tools/pivot.py @@ -217,7 +217,7 @@ def _all_key(key): row_names = result.index.names result = result.append(margin_dummy) - result.index.names = row_names + result.index = result.index.set_names(row_names) return result diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index 1008e23c3ebcd..67adc6bf8e7f2 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -1299,7 +1299,7 @@ def test_concat_multiindex_with_keys(self): columns=Index(['A', 'B', 'C'], name='exp')) result = concat([frame, frame], keys=[0, 1], names=['iteration']) - self.assertEqual(result.index.names, ['iteration'] + index.names) + self.assertEqual(result.index.names, ('iteration',) + index.names) tm.assert_frame_equal(result.ix[0], frame) tm.assert_frame_equal(result.ix[1], frame) self.assertEqual(result.index.nlevels, 3) @@ -1330,14 +1330,14 @@ def test_concat_keys_and_levels(self): keys=[('foo', 'one'), ('foo', 'two'), ('baz', 'one'), ('baz', 'two')], levels=levels) - self.assertEqual(result.index.names, [None] * 3) + 
self.assertEqual(result.index.names, (None,) * 3) # no levels result = concat([df, df2, df, df2], keys=[('foo', 'one'), ('foo', 'two'), ('baz', 'one'), ('baz', 'two')], names=['first', 'second']) - self.assertEqual(result.index.names, ['first', 'second'] + [None]) + self.assertEqual(result.index.names, ('first', 'second') + (None,)) self.assert_(np.array_equal(result.index.levels[0], ['baz', 'foo'])) def test_concat_keys_levels_no_overlap(self): @@ -1363,7 +1363,9 @@ def test_concat_rename_index(self): names=['lvl0', 'lvl1']) exp = concat([a, b], keys=['key0', 'key1'], names=['lvl0']) - exp.index.names[1] = 'lvl1' + names = list(exp.index.names) + names[1] = 'lvl1' + exp.index.set_names(names, inplace=True) tm.assert_frame_equal(result, exp) self.assertEqual(result.index.names, exp.index.names) @@ -1391,7 +1393,7 @@ def test_crossed_dtypes_weird_corner(self): df2 = DataFrame(np.random.randn(1, 4), index=['b']) result = concat( [df, df2], keys=['one', 'two'], names=['first', 'second']) - self.assertEqual(result.index.names, ['first', 'second']) + self.assertEqual(result.index.names, ('first', 'second')) def test_handle_empty_objects(self): df = DataFrame(np.random.randn(10, 4), columns=list('abcd')) diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tools/tests/test_pivot.py index 57e7d2f7f6ae9..1718648f81157 100644 --- a/pandas/tools/tests/test_pivot.py +++ b/pandas/tools/tests/test_pivot.py @@ -42,7 +42,7 @@ def test_pivot_table(self): pivot_table(self.data, values='D', rows=rows) if len(rows) > 1: - self.assertEqual(table.index.names, rows) + self.assertEqual(table.index.names, tuple(rows)) else: self.assertEqual(table.index.name, rows[0]) @@ -365,7 +365,7 @@ def test_crosstab_margins(self): result = crosstab(a, [b, c], rownames=['a'], colnames=('b', 'c'), margins=True) - self.assertEqual(result.index.names, ['a']) + self.assertEqual(result.index.names, ('a',)) self.assertEqual(result.columns.names, ['b', 'c']) all_cols = result['All', ''] diff --git 
a/pandas/util/decorators.py b/pandas/util/decorators.py index 8c6744cbf2963..d83b1eb778763 100644 --- a/pandas/util/decorators.py +++ b/pandas/util/decorators.py @@ -4,8 +4,8 @@ import warnings -def deprecate(name, alternative): - alt_name = alternative.__name__ +def deprecate(name, alternative, alt_name=None): + alt_name = alt_name or alternative.__name__ def wrapper(*args, **kwargs): warnings.warn("%s is deprecated. Use %s instead" % (name, alt_name), From 5cad4d205d1878561ac867591ebe4c12e6ca17c5 Mon Sep 17 00:00:00 2001 From: Jeff Tratner Date: Thu, 4 Jul 2013 11:57:24 -0400 Subject: [PATCH 4/4] ENH: Additional keyword arguments for Index.copy() * Index derivatives can set `name` or `names` as well as `dtype` on copy. MultiIndex can set `levels`, `labels`, and `names`. * Also, `__deepcopy__` just calls `copy(deep=True)` * Now, BlockManager.copy() takes an additional argument `copy_axes` which copies axes as well. Defaults to False. * `Series.copy()` takes an optional deep argument, which causes it to copy its index. * `DataFrame.copy()` passes `copy_axes=True` when deepcopying. 
* Add copy kwarg to MultiIndex `__new__` --- doc/source/release.rst | 3 + pandas/core/index.py | 107 ++++++++++++++++++++++++++++-------- pandas/core/series.py | 21 ++++++- pandas/tests/test_common.py | 1 + pandas/tests/test_index.py | 85 +++++++++++++++++++--------- 5 files changed, 163 insertions(+), 54 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 58dcce9b9e290..ab7a347ef0c58 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -47,6 +47,9 @@ pandas 0.13 - Added a more informative error message when plot arguments contain overlapping color and style arguments (:issue:`4402`) - Significant table writing performance improvements in ``HDFStore`` + - ``Index.copy()`` and ``MultiIndex.copy()`` now accept keyword arguments to + change attributes (i.e., ``names``, ``levels``, ``labels``) + (:issue:`4039`) - Add ``rename`` and ``set_names`` methods to ``Index`` as well as ``set_names``, ``set_levels``, ``set_labels`` to ``MultiIndex``. (:issue:`4039`) diff --git a/pandas/core/index.py b/pandas/core/index.py index 9b7960389ad51..7be19302d88d5 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -110,6 +110,12 @@ def __new__(cls, data, dtype=None, copy=False, name=None, **kwargs): return Int64Index(data, copy=copy, dtype=dtype, name=name) subarr = com._asarray_tuplesafe(data, dtype=object) + + # _asarray_tuplesafe does not always copy underlying data, + # so need to make sure that this happens + if copy: + subarr = subarr.copy() + elif np.isscalar(data): raise TypeError('Index(...) 
must be called with a collection ' 'of some kind, %s was passed' % repr(data)) @@ -120,7 +126,7 @@ def __new__(cls, data, dtype=None, copy=False, name=None, **kwargs): if dtype is None: inferred = lib.infer_dtype(subarr) if inferred == 'integer': - return Int64Index(subarr.astype('i8'), name=name) + return Int64Index(subarr.astype('i8'), copy=copy, name=name) elif inferred != 'string': if (inferred.startswith('datetime') or tslib.is_timestamp_array(subarr)): @@ -145,6 +151,41 @@ def __array_finalize__(self, obj): def _shallow_copy(self): return self.view() + def copy(self, names=None, name=None, dtype=None, deep=False): + """ + Make a copy of this object. Name and dtype sets those attributes on + the new object. + + Parameters + ---------- + name : string, optional + dtype : numpy dtype or pandas type + + Returns + ------- + copy : Index + + Notes + ----- + In most cases, there should be no functional difference from using + ``deep``, but if ``deep`` is passed it will attempt to deepcopy. 
+ """ + if names is not None and name is not None: + raise TypeError("Can only provide one of `names` and `name`") + if deep: + from copy import deepcopy + new_index = np.ndarray.__deepcopy__(self, {}).view(self.__class__) + name = name or deepcopy(self.name) + else: + new_index = super(Index, self).copy() + if name is not None: + names = [name] + if names: + new_index = new_index.set_names(names) + if dtype: + new_index = new_index.astype(dtype) + return new_index + def __unicode__(self): """ Return a string representation for a particular Index @@ -338,10 +379,7 @@ def __setstate__(self, state): np.ndarray.__setstate__(self, state) def __deepcopy__(self, memo={}): - """ - Index is not mutable, so disabling deepcopy - """ - return self._shallow_copy() + return self.copy(deep=True) def __contains__(self, key): hash(key) @@ -1440,9 +1478,9 @@ class MultiIndex(Index): Parameters ---------- - levels : list or tuple of arrays + levels : sequence of arrays The unique labels for each level - labels : list or tuple of arrays + labels : sequence of arrays Integers for each level designating which label at each location sortorder : optional int Level of sortedness (must be lexicographically sorted by that @@ -1455,7 +1493,8 @@ class MultiIndex(Index): _levels = FrozenList() _labels = FrozenList() - def __new__(cls, levels=None, labels=None, sortorder=None, names=None): + def __new__(cls, levels=None, labels=None, sortorder=None, names=None, + copy=False): if len(levels) != len(labels): raise ValueError( 'Length of levels and labels must be the same') @@ -1467,12 +1506,12 @@ def __new__(cls, levels=None, labels=None, sortorder=None, names=None): else: name = None - return Index(levels[0], name=name).take(labels[0]) + return Index(levels[0], name=name, copy=True).take(labels[0]) # v3, 0.8.0 subarr = np.empty(0, dtype=object).view(cls) - subarr._set_levels(levels) - subarr._set_labels(labels) + subarr._set_levels(levels, copy=copy) + subarr._set_labels(labels, copy=copy) if 
names is not None: subarr._set_names(names) @@ -1489,13 +1528,13 @@ def _get_levels(self): return self._levels - def _set_levels(self, levels): + def _set_levels(self, levels, copy=False): # This is NOT part of the levels property because it should be # externally not allowed to set levels. User beware if you change # _levels directly if len(levels) == 0: raise ValueError("Must set non-zero number of levels.") - levels = FrozenList(_ensure_index(lev)._shallow_copy() + levels = FrozenList(_ensure_index(lev, copy=copy)._shallow_copy() for lev in levels) names = self.names self._levels = levels @@ -1534,10 +1573,11 @@ def set_levels(self, levels, inplace=False): def _get_labels(self): return self._labels - def _set_labels(self, labels): + def _set_labels(self, labels, copy=False): if len(labels) != self.nlevels: raise ValueError("Length of levels and labels must be the same.") - self._labels = FrozenList(_ensure_frozen(labs)._shallow_copy() for labs in labels) + self._labels = FrozenList(_ensure_frozen(labs,copy=copy)._shallow_copy() + for labs in labels) def set_labels(self, labels, inplace=False): """ @@ -1546,8 +1586,8 @@ def set_labels(self, labels, inplace=False): Parameters ---------- - labels : sequence - new levels to apply + labels : sequence of arrays + new labels to apply inplace : bool if True, mutates in place @@ -1592,6 +1632,11 @@ def copy(self, names=None, dtype=None, levels=None, labels=None, This could be potentially expensive on large MultiIndex objects. 
""" new_index = np.ndarray.copy(self) + if deep: + from copy import deepcopy + levels = levels if levels is not None else deepcopy(self.levels) + labels = labels if labels is not None else deepcopy(self.labels) + names = names if names is not None else deepcopy(self.names) if levels is not None: new_index = new_index.set_levels(levels) if labels is not None: @@ -2831,11 +2876,13 @@ def _sparsify(label_list, start=0,sentinal=''): return lzip(*result) -def _ensure_index(index_like): +def _ensure_index(index_like, copy=False): if isinstance(index_like, Index): + if copy: + index_like = index_like.copy() return index_like if hasattr(index_like, 'name'): - return Index(index_like, name=index_like.name) + return Index(index_like, name=index_like.name, copy=copy) # must check for exactly list here because of strict type # check in clean_index_list @@ -2849,15 +2896,27 @@ def _ensure_index(index_like): return MultiIndex.from_arrays(converted) else: index_like = converted + else: + # clean_index_list does the equivalent of copying + # so only need to do this if not list instance + if copy: + from copy import copy + index_like = copy(index_like) return Index(index_like) -def _ensure_frozen(nd_array_like): - if isinstance(nd_array_like, FrozenNDArray): - return nd_array_like - else: + +def _ensure_frozen(nd_array_like, copy=False): + if not isinstance(nd_array_like, FrozenNDArray): arr = np.asarray(nd_array_like, dtype=np.int_) - return arr.view(FrozenNDArray) + # have to do this separately so that non-index input gets copied + if copy: + arr = arr.copy() + nd_array_like = arr.view(FrozenNDArray) + else: + if copy: + nd_array_like = nd_array_like.copy() + return nd_array_like def _validate_join_method(method): diff --git a/pandas/core/series.py b/pandas/core/series.py index 58fd0a0551ace..e283058209e79 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1309,16 +1309,31 @@ def values(self): """ return self.view(ndarray) - def copy(self, order='C'): + def 
copy(self, order='C', deep=False): """ Return new Series with copy of underlying values + Parameters + ---------- + deep : boolean, default False + deep copy index along with data + order : boolean, default 'C' + order for underlying numpy array + Returns ------- cp : Series """ - return Series(self.values.copy(order), index=self.index, - name=self.name) + if deep: + from copy import deepcopy + index = self.index.copy(deep=deep) + name = deepcopy(self.name) + else: + index = self.index + name = self.name + + return Series(self.values.copy(order), index=index, + name=name) def tolist(self): """ diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index 3660b31b912fc..946e640d331cc 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -9,6 +9,7 @@ from pandas.tslib import iNaT from pandas import Series, DataFrame, date_range, DatetimeIndex, Timestamp +import pandas.compat as compat from pandas.compat import range, long, lrange, lmap, u from pandas.core.common import notnull, isnull import pandas.compat as compat diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index b6c32a1e49405..a5f98107895a5 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -12,8 +12,10 @@ from numpy.testing import assert_array_equal from pandas.core.index import Index, Int64Index, MultiIndex, InvalidIndexError -from pandas.util.testing import(assert_almost_equal, assertRaisesRegexp, - assert_copy) +from pandas.core.frame import DataFrame +from pandas.core.series import Series +from pandas.util.testing import (assert_almost_equal, assertRaisesRegexp, + assert_copy) from pandas import compat import pandas.util.testing as tm @@ -54,12 +56,18 @@ def test_new_axis(self): self.assert_(new_index.ndim == 2) tm.assert_isinstance(new_index, np.ndarray) - def test_deepcopy(self): - from copy import deepcopy + def test_copy_and_deepcopy(self): + from copy import copy, deepcopy - copy = deepcopy(self.strIndex) - 
self.assert_(copy is not self.strIndex) - self.assert_(copy.equals(self.strIndex)) + for func in (copy, deepcopy): + idx_copy = func(self.strIndex) + self.assert_(idx_copy is not self.strIndex) + self.assert_(idx_copy.equals(self.strIndex)) + + new_copy = self.strIndex.copy(deep=True, name="banana") + self.assertEqual(new_copy.name, "banana") + new_copy2 = self.intIndex.copy(dtype=int) + self.assertEqual(new_copy2.dtype.kind, 'i') def test_duplicates(self): idx = Index([0, 0, 0]) @@ -88,6 +96,8 @@ def test_constructor(self): tm.assert_isinstance(index, Index) self.assert_(index.name == 'name') assert_array_equal(arr, index) + arr[0] = "SOMEBIGLONGSTRING" + self.assertNotEqual(index[0], "SOMEBIGLONGSTRING") # what to do here? # arr = np.array(5.) @@ -598,6 +608,15 @@ def test_constructor(self): # scalar raise Exception self.assertRaises(ValueError, Int64Index, 5) + # copy + arr = self.index.values + new_index = Int64Index(arr, copy=True) + self.assert_(np.array_equal(new_index, self.index)) + val = arr[0] + 3000 + # this should not change index + arr[0] = val + self.assertNotEqual(new_index[0], val) + def test_constructor_corner(self): arr = np.array([1, 2, 3, 4], dtype=object) index = Int64Index(arr) @@ -970,6 +989,35 @@ def setUp(self): labels=[major_labels, minor_labels], names=self.index_names) + def test_copy_in_constructor(self): + levels = np.array(["a", "b", "c"]) + labels = np.array([1, 1, 2, 0, 0, 1, 1]) + val = labels[0] + mi = MultiIndex(levels=[levels, levels], labels=[labels, labels], + copy=True) + self.assertEqual(mi.labels[0][0], val) + labels[0] = 15 + self.assertEqual(mi.labels[0][0], val) + val = levels[0] + levels[0] = "PANDA" + self.assertEqual(mi.levels[0][0], val) + + def test_set_value_keeps_names(self): + # motivating example from #3742 + lev1 = ['hans', 'hans', 'hans', 'grethe', 'grethe', 'grethe'] + lev2 = ['1', '2', '3'] * 2 + idx = pd.MultiIndex.from_arrays( + [lev1, lev2], + names=['Name', 'Number']) + df = pd.DataFrame( + 
np.random.randn(6, 4), + columns=['one', 'two', 'three', 'four'], + index=idx) + df = df.sortlevel() + self.assertEqual(df.index.names, ('Name', 'Number')) + df = df.set_value(('grethe', '4'), 'one', 99.34) + self.assertEqual(df.index.names, ('Name', 'Number')) + def test_names(self): # names are assigned in __init__ @@ -1046,11 +1094,11 @@ def assert_multiindex_copied(self, copy, original): self.assert_(copy.labels is not original.labels) # names doesn't matter which way copied - self.assert_(copy.names == original.names) + self.assertEqual(copy.names, original.names) self.assert_(copy.names is not original.names) # sort order should be copied - self.assert_(copy.sortorder == original.sortorder) + self.assertEqual(copy.sortorder, original.sortorder) def test_copy(self): i_copy = self.index.copy() @@ -1469,7 +1517,7 @@ def test_get_indexer(self): # create index with duplicates idx1 = Index(lrange(10) + lrange(10)) - idx2 = Index(range(20)) + idx2 = Index(lrange(20)) assertRaisesRegexp(InvalidIndexError, "Reindexing only valid with" " uniquely valued Index objects", idx1.get_indexer, idx2) @@ -1672,22 +1720,6 @@ def test_diff(self): assertRaisesRegexp(TypeError, "other must be a MultiIndex or a list" " of tuples", first.diff, [1,2,3,4,5]) - def test_set_value_keeps_names(self): - # motivating example from #3742 - lev1 = ['hans', 'hans', 'hans', 'grethe', 'grethe', 'grethe'] - lev2 = ['1', '2', '3'] * 2 - idx = pd.MultiIndex.from_arrays( - [lev1, lev2], - names=['Name', 'Number']) - df = pd.DataFrame( - np.random.randn(6, 4), - columns=['one', 'two', 'three', 'four'], - index=idx) - df = df.sortlevel() - self.assertEqual(tuple(df.index.names), ('Name', 'Number')) - df = df.set_value(('grethe', 'hans'), 'one', 99.34) - self.assertEqual(tuple(df.index.names), ('Name', 'Number')) - def test_from_tuples(self): assertRaisesRegexp(TypeError, 'Cannot infer number of levels from' ' empty list', MultiIndex.from_tuples, []) @@ -1957,7 +1989,6 @@ def test_get_combined_index(): 
result = _get_combined_index([]) assert(result.equals(Index([]))) - if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False)