From 2851225ca2d8833e3bc0a064977fc1472f7be625 Mon Sep 17 00:00:00 2001 From: tp Date: Thu, 20 Sep 2018 21:00:35 +0100 Subject: [PATCH 01/12] rename MultiIndex.labels -> codes --- pandas/core/indexes/multi.py | 82 ++++++++++--------- .../tests/indexes/multi/test_constructor.py | 2 +- 2 files changed, 45 insertions(+), 39 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 4e5894916bd44..4c3aaf6b37f5c 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -126,7 +126,12 @@ class MultiIndex(Index): ---------- levels : sequence of arrays The unique labels for each level + codes : sequence of arrays + Integers for each level designating which label at each location labels : sequence of arrays + .. deprecated:: 0.24.0 + Use ``codes`` instead + Integers for each level designating which label at each location sortorder : optional int Level of sortedness (must be lexicographically sorted by that @@ -170,6 +175,7 @@ class MultiIndex(Index): ---------- names levels + codes labels nlevels levshape @@ -195,7 +201,7 @@ class MultiIndex(Index): _typ = 'multiindex' _names = FrozenList() _levels = FrozenList() - _labels = FrozenList() + _codes = FrozenList() _comparables = ['names'] rename = Index.set_names @@ -220,7 +226,7 @@ def __new__(cls, levels=None, labels=None, sortorder=None, names=None, # we've already validated levels and labels, so shortcut here result._set_levels(levels, copy=copy, validate=False) - result._set_labels(labels, copy=copy, validate=False) + result._set_codes(labels, copy=copy, validate=False) if names is not None: # handles name validation @@ -237,39 +243,39 @@ def __new__(cls, levels=None, labels=None, sortorder=None, names=None, result._reset_identity() return result - def _verify_integrity(self, labels=None, levels=None): + def _verify_integrity(self, codes=None, levels=None): """ Parameters ---------- - labels : optional list - Labels to check for validity. Defaults to current labels. + codes : optional list + Codes to check for validity. Defaults to current codes. levels : optional list Levels to check for validity. Defaults to current levels. Raises ------ ValueError - If length of levels and labels don't match, if any label would + If length of levels and codes don't match, if any code would exceed level bounds, or there are any duplicate levels. """ # NOTE: Currently does not check, among other things, that cached # nlevels matches nor that sortorder matches actually sortorder. - labels = labels or self.labels + codes = codes or self.labels levels = levels or self.levels - if len(levels) != len(labels): - raise ValueError("Length of levels and labels must match. NOTE:" + if len(levels) != len(codes): + raise ValueError("Length of levels and codes must match. NOTE:" " this index is in an inconsistent state.") - label_length = len(self.labels[0]) - for i, (level, label) in enumerate(zip(levels, labels)): - if len(label) != label_length: - raise ValueError("Unequal label lengths: %s" % - ([len(lab) for lab in labels])) - if len(label) and label.max() >= len(level): - raise ValueError("On level %d, label max (%d) >= length of" + codes_length = len(self.labels[0]) + for i, (level, level_codes) in enumerate(zip(levels, codes)): + if len(level_codes) != codes_length: + raise ValueError("Unequal code lengths: %s" % + ([len(code_) for code_ in codes])) + if len(level_codes) and level_codes.max() >= len(level): + raise ValueError("On level %d, code max (%d) >= length of" " level (%d). 
NOTE: this index is in an" - " inconsistent state" % (i, label.max(), + " inconsistent state" % (i, level_codes.max(), len(level))) if not level.is_unique: raise ValueError("Level values must be unique: {values} on " @@ -573,33 +579,33 @@ def set_levels(self, levels, level=None, inplace=False, @property def labels(self): - return self._labels + return self._codes - def _set_labels(self, labels, level=None, copy=False, validate=True, - verify_integrity=False): + def _set_codes(self, codes, level=None, copy=False, validate=True, + verify_integrity=False): - if validate and level is None and len(labels) != self.nlevels: - raise ValueError("Length of labels must match number of levels") - if validate and level is not None and len(labels) != len(level): - raise ValueError('Length of labels must match length of levels.') + if validate and level is None and len(codes) != self.nlevels: + raise ValueError("Length of codes must match number of levels") + if validate and level is not None and len(codes) != len(level): + raise ValueError('Length of codes must match length of levels.') if level is None: - new_labels = FrozenList( - _ensure_frozen(lab, lev, copy=copy)._shallow_copy() - for lev, lab in zip(self.levels, labels)) + new_codes = FrozenList( + _ensure_frozen(level_codes, lev, copy=copy)._shallow_copy() + for lev, level_codes in zip(self.levels, codes)) else: level = [self._get_level_number(l) for l in level] - new_labels = list(self._labels) - for lev_idx, lab in zip(level, labels): + new_codes = list(self._codes) + for lev_idx, level_codes in zip(level, codes): lev = self.levels[lev_idx] - new_labels[lev_idx] = _ensure_frozen( - lab, lev, copy=copy)._shallow_copy() - new_labels = FrozenList(new_labels) + new_codes[lev_idx] = _ensure_frozen( + level_codes, lev, copy=copy)._shallow_copy() + new_codes = FrozenList(new_codes) if verify_integrity: - self._verify_integrity(labels=new_labels) + self._verify_integrity(codes=new_codes) - self._labels = new_labels + self._codes = new_codes self._tuples = None self._reset_cache() @@ -662,7 +668,7 @@ def set_labels(self, labels, level=None, inplace=False, else: idx = self._shallow_copy() idx._reset_identity() - idx._set_labels(labels, level=level, verify_integrity=verify_integrity) + idx._set_codes(labels, level=level, verify_integrity=verify_integrity) if not inplace: return idx @@ -801,7 +807,7 @@ def _format_attrs(self): attrs = [ ('levels', ibase.default_pprint(self._levels, max_seq_items=False)), - ('labels', ibase.default_pprint(self._labels, + ('labels', ibase.default_pprint(self._codes, max_seq_items=False))] if com._any_not_none(*self.names): attrs.append(('names', ibase.default_pprint(self.names))) @@ -1602,7 +1608,7 @@ def remove_unused_levels(self): if changed: result._reset_identity() result._set_levels(new_levels, validate=False) - result._set_labels(new_labels, validate=False) + result._set_codes(new_labels, validate=False) return result @@ -1638,7 +1644,7 @@ def __setstate__(self, state): levels, labels, sortorder, names = own_state self._set_levels([Index(x) for x in levels], validate=False) - self._set_labels(labels) + self._set_codes(labels) self._set_names(names) self.sortorder = sortorder self._verify_integrity() diff --git a/pandas/tests/indexes/multi/test_constructor.py b/pandas/tests/indexes/multi/test_constructor.py index 4ad20e9d6ee81..cd80660709792 100644 --- a/pandas/tests/indexes/multi/test_constructor.py +++ b/pandas/tests/indexes/multi/test_constructor.py @@ -67,7 +67,7 @@ def test_constructor_mismatched_label_levels(idx): 
MultiIndex(levels=levels, labels=labels)
 
     length_error = re.compile('>= length of level')
-    label_error = re.compile(r'Unequal label lengths: \[4, 2\]')
+    label_error = re.compile(r'Unequal code lengths: \[4, 2\]')
 
     # important to check that it's looking at the right thing.
     with pytest.raises(ValueError, match=length_error):

From 10dc7c05cfdd22d411d71ea73d9ee483758b6fc1 Mon Sep 17 00:00:00 2001
From: tp
Date: Sat, 27 Oct 2018 12:09:32 +0100
Subject: [PATCH 02/12] MultiIndex.set_labels -> set_codes

---
 pandas/core/indexes/multi.py                |  37 +++--
 pandas/tests/indexes/multi/test_compat.py   |   4 +-
 .../tests/indexes/multi/test_constructor.py |   2 +-
 pandas/tests/indexes/multi/test_get_set.py  | 137 ++++++++++++------
 4 files changed, 118 insertions(+), 62 deletions(-)

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 4c3aaf6b37f5c..ab032026c957d 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -31,6 +31,8 @@
 
 from pandas.io.formats.printing import pprint_thing
 
+from pandas.util._decorators import deprecate_kwarg
+
 _index_doc_kwargs = dict(ibase._index_doc_kwargs)
 _index_doc_kwargs.update(
     dict(klass='MultiIndex',
@@ -256,8 +258,8 @@ def _verify_integrity(self, codes=None, levels=None):
         Raises
         ------
         ValueError
-            If length of levels and codes don't match, if any code would
-            exceed level bounds, or there are any duplicate levels.
+            If length of levels and codes don't match, if the codes for any
+            level would exceed level bounds, or there are any duplicate levels.
         """
         # NOTE: Currently does not check, among other things, that cached
         # nlevels matches nor that sortorder matches actually sortorder.
@@ -611,14 +613,23 @@ def _set_codes(self, codes, level=None, copy=False, validate=True,
 
     def set_labels(self, labels, level=None, inplace=False,
                    verify_integrity=True):
+        warnings.warn(("set_labels was deprecated in version 0.24.0. "
+                       "Use set_codes instead."),
+                      FutureWarning, stacklevel=2)
+        return self.set_codes(labels, level=level, inplace=inplace,
+                              verify_integrity=verify_integrity)
+
+    @deprecate_kwarg(old_arg_name='labels', new_arg_name='codes')
+    def set_codes(self, codes, level=None, inplace=False,
+                  verify_integrity=True):
         """
-        Set new labels on MultiIndex. Defaults to returning
+        Set new codes on MultiIndex. Defaults to returning
         new index.
Parameters ---------- - labels : sequence or list of sequence - new labels to apply + codes : sequence or list of sequence + new codes to apply level : int, level name, or sequence of int/level names (default None) level(s) to set (None for all levels) inplace : bool @@ -653,22 +664,22 @@ def set_labels(self, labels, level=None, inplace=False, names=[u'foo', u'bar']) """ if level is not None and not is_list_like(level): - if not is_list_like(labels): - raise TypeError("Labels must be list-like") - if is_list_like(labels[0]): - raise TypeError("Labels must be list-like") + if not is_list_like(codes): + raise TypeError("Codes must be list-like") + if is_list_like(codes[0]): + raise TypeError("Codes must be list-like") level = [level] - labels = [labels] + codes = [codes] elif level is None or is_list_like(level): - if not is_list_like(labels) or not is_list_like(labels[0]): - raise TypeError("Labels must be list of lists-like") + if not is_list_like(codes) or not is_list_like(codes[0]): + raise TypeError("Codes must be list of lists-like") if inplace: idx = self else: idx = self._shallow_copy() idx._reset_identity() - idx._set_codes(labels, level=level, verify_integrity=verify_integrity) + idx._set_codes(codes, level=level, verify_integrity=verify_integrity) if not inplace: return idx diff --git a/pandas/tests/indexes/multi/test_compat.py b/pandas/tests/indexes/multi/test_compat.py index 23ea0c306d47c..0353e39935cb6 100644 --- a/pandas/tests/indexes/multi/test_compat.py +++ b/pandas/tests/indexes/multi/test_compat.py @@ -92,7 +92,7 @@ def test_inplace_mutation_resets_values(): # Must be 1d array of tuples assert exp_values.shape == (6,) - new_values = mi2.set_labels(labels2).values + new_values = mi2.set_codes(labels2).values # Not inplace shouldn't change tm.assert_almost_equal(mi2._tuples, vals2) @@ -101,7 +101,7 @@ def test_inplace_mutation_resets_values(): tm.assert_almost_equal(exp_values, new_values) # ...and again setting inplace should kill _tuples, etc - mi2.set_labels(labels2, inplace=True) + mi2.set_codes(labels2, inplace=True) tm.assert_almost_equal(mi2.values, new_values) diff --git a/pandas/tests/indexes/multi/test_constructor.py b/pandas/tests/indexes/multi/test_constructor.py index cd80660709792..75cd536ef2548 100644 --- a/pandas/tests/indexes/multi/test_constructor.py +++ b/pandas/tests/indexes/multi/test_constructor.py @@ -82,7 +82,7 @@ def test_constructor_mismatched_label_levels(idx): idx.copy().set_levels([['a'], ['b']]) with pytest.raises(ValueError, match=label_error): - idx.copy().set_labels([[0, 0, 0, 0], [0, 0]]) + idx.copy().set_codes([[0, 0, 0, 0], [0, 0]]) def test_copy_in_constructor(): diff --git a/pandas/tests/indexes/multi/test_get_set.py b/pandas/tests/indexes/multi/test_get_set.py index a5f586bd98d5f..d534ac717cbee 100644 --- a/pandas/tests/indexes/multi/test_get_set.py +++ b/pandas/tests/indexes/multi/test_get_set.py @@ -171,7 +171,7 @@ def test_set_levels_labels_directly(idx): def test_set_levels(idx): - # side note - you probably wouldn't want to use levels and labels + # side note - you probably wouldn't want to use levels and codes # directly like this - but it is possible. 
levels = idx.levels new_levels = [[lev + 'a' for lev in level] for level in levels] @@ -231,9 +231,15 @@ def test_set_levels(idx): assert_matching(idx.levels, original_index.levels, check_dtype=True) +<<<<<<< HEAD with pytest.raises(ValueError, match="^On"): idx.set_labels([0, 1, 2, 3, 4, 5], level=0, inplace=inplace) +======= + with tm.assert_raises_regex(ValueError, "^On"): + idx.set_codes([0, 1, 2, 3, 4, 5], level=0, + inplace=inplace) +>>>>>>> MultiIndex.set_labels -> set_codes assert_matching(idx.labels, original_index.labels, check_dtype=True) @@ -242,92 +248,118 @@ def test_set_levels(idx): assert_matching(idx.levels, original_index.levels, check_dtype=True) +<<<<<<< HEAD with pytest.raises(TypeError, match="^Labels"): idx.set_labels(1, level=0, inplace=inplace) +======= + with tm.assert_raises_regex(TypeError, "^Codes"): + idx.set_codes(1, level=0, inplace=inplace) +>>>>>>> MultiIndex.set_labels -> set_codes assert_matching(idx.labels, original_index.labels, check_dtype=True) -def test_set_labels(idx): - # side note - you probably wouldn't want to use levels and labels +def test_set_codes(idx): + # side note - you probably wouldn't want to use levels and codes # directly like this - but it is possible. - labels = idx.labels - major_labels, minor_labels = labels - major_labels = [(x + 1) % 3 for x in major_labels] - minor_labels = [(x + 1) % 1 for x in minor_labels] - new_labels = [major_labels, minor_labels] - - # label changing [w/o mutation] - ind2 = idx.set_labels(new_labels) - assert_matching(ind2.labels, new_labels) - assert_matching(idx.labels, labels) - - # label changing [w/ mutation] + codes = idx.labels + major_codes, minor_codes = codes + major_codes = [(x + 1) % 3 for x in major_codes] + minor_codes = [(x + 1) % 1 for x in minor_codes] + new_codes = [major_codes, minor_codes] + + # changing codes w/o mutation + ind2 = idx.set_codes(new_codes) + assert_matching(ind2.labels, new_codes) + assert_matching(idx.labels, codes) + + # changing label w/ mutation ind2 = idx.copy() - inplace_return = ind2.set_labels(new_labels, inplace=True) + inplace_return = ind2.set_codes(new_codes, inplace=True) assert inplace_return is None - assert_matching(ind2.labels, new_labels) + assert_matching(ind2.labels, new_codes) - # label changing specific level [w/o mutation] - ind2 = idx.set_labels(new_labels[0], level=0) - assert_matching(ind2.labels, [new_labels[0], labels[1]]) - assert_matching(idx.labels, labels) + # codes changing specific level w/o mutation + ind2 = idx.set_codes(new_codes[0], level=0) + assert_matching(ind2.labels, [new_codes[0], codes[1]]) + assert_matching(idx.labels, codes) - ind2 = idx.set_labels(new_labels[1], level=1) - assert_matching(ind2.labels, [labels[0], new_labels[1]]) - assert_matching(idx.labels, labels) + ind2 = idx.set_codes(new_codes[1], level=1) + assert_matching(ind2.labels, [codes[0], new_codes[1]]) + assert_matching(idx.labels, codes) - # label changing multiple levels [w/o mutation] - ind2 = idx.set_labels(new_labels, level=[0, 1]) - assert_matching(ind2.labels, new_labels) - assert_matching(idx.labels, labels) + # codes changing multiple levels w/o mutation + ind2 = idx.set_codes(new_codes, level=[0, 1]) + assert_matching(ind2.labels, new_codes) + assert_matching(idx.labels, codes) - # label changing specific level [w/ mutation] + # label changing specific level w/ mutation ind2 = idx.copy() - inplace_return = ind2.set_labels(new_labels[0], level=0, inplace=True) + inplace_return = ind2.set_codes(new_codes[0], level=0, inplace=True) assert 
inplace_return is None - assert_matching(ind2.labels, [new_labels[0], labels[1]]) - assert_matching(idx.labels, labels) + assert_matching(ind2.labels, [new_codes[0], codes[1]]) + assert_matching(idx.labels, codes) ind2 = idx.copy() - inplace_return = ind2.set_labels(new_labels[1], level=1, inplace=True) + inplace_return = ind2.set_codes(new_codes[1], level=1, inplace=True) assert inplace_return is None - assert_matching(ind2.labels, [labels[0], new_labels[1]]) - assert_matching(idx.labels, labels) + assert_matching(ind2.labels, [codes[0], new_codes[1]]) + assert_matching(idx.labels, codes) - # label changing multiple levels [w/ mutation] + # codes changing multiple levels [w/ mutation] ind2 = idx.copy() - inplace_return = ind2.set_labels(new_labels, level=[0, 1], - inplace=True) + inplace_return = ind2.set_codes(new_codes, level=[0, 1], + inplace=True) assert inplace_return is None - assert_matching(ind2.labels, new_labels) - assert_matching(idx.labels, labels) + assert_matching(ind2.labels, new_codes) + assert_matching(idx.labels, codes) # label changing for levels of different magnitude of categories + ind = pd.MultiIndex.from_tuples([(0, i) for i in range(130)]) + new_codes = range(129, -1, -1) + expected = pd.MultiIndex.from_tuples( + [(0, i) for i in new_codes]) + + # [w/o mutation] + result = ind.set_codes(codes=new_codes, level=1) + assert result.equals(expected) + + # [w/ mutation] + result = ind.copy() + result.set_codes(codes=new_codes, level=1, inplace=True) + assert result.equals(expected) + + with tm.assert_produces_warning(FutureWarning): + ind.set_codes(labels=new_codes, level=1) + + +def test_set_labels_deprecated(): ind = pd.MultiIndex.from_tuples([(0, i) for i in range(130)]) new_labels = range(129, -1, -1) expected = pd.MultiIndex.from_tuples( [(0, i) for i in new_labels]) # [w/o mutation] - result = ind.set_labels(labels=new_labels, level=1) + with tm.assert_produces_warning(FutureWarning): + result = ind.set_labels(labels=new_labels, level=1) assert result.equals(expected) # [w/ mutation] result = ind.copy() - result.set_labels(labels=new_labels, level=1, inplace=True) + with tm.assert_produces_warning(FutureWarning): + result.set_labels(labels=new_labels, level=1, inplace=True) assert result.equals(expected) -def test_set_levels_labels_names_bad_input(idx): - levels, labels = idx.levels, idx.labels +def test_set_levels_codes_names_bad_input(idx): + levels, codes = idx.levels, idx.labels names = idx.names with pytest.raises(ValueError, match='Length of levels'): idx.set_levels([levels[0]]) - with pytest.raises(ValueError, match='Length of labels'): - idx.set_labels([labels[0]]) + with tm.assert_raises_regex(ValueError, 'Length of codes'): + idx.set_codes([codes[0]]) with pytest.raises(ValueError, match='Length of names'): idx.set_names([names[0]]) @@ -337,8 +369,13 @@ def test_set_levels_labels_names_bad_input(idx): idx.set_levels(levels[0]) # shouldn't scalar data error, instead should demand list-like +<<<<<<< HEAD with pytest.raises(TypeError, match='list of lists-like'): idx.set_labels(labels[0]) +======= + with tm.assert_raises_regex(TypeError, 'list of lists-like'): + idx.set_codes(codes[0]) +>>>>>>> MultiIndex.set_labels -> set_codes # shouldn't scalar data error, instead should demand list-like with pytest.raises(TypeError, match='list-like'): @@ -352,11 +389,19 @@ def test_set_levels_labels_names_bad_input(idx): idx.set_levels(levels, level=0) # should have equal lengths +<<<<<<< HEAD with pytest.raises(TypeError, match='list of lists-like'): 
idx.set_labels(labels[0], level=[0, 1]) with pytest.raises(TypeError, match='list-like'): idx.set_labels(labels, level=0) +======= + with tm.assert_raises_regex(TypeError, 'list of lists-like'): + idx.set_codes(codes[0], level=[0, 1]) + + with tm.assert_raises_regex(TypeError, 'list-like'): + idx.set_codes(codes, level=0) +>>>>>>> MultiIndex.set_labels -> set_codes # should have equal lengths with pytest.raises(ValueError, match='Length of names'): From 94a6c44f6efd2de86467c1bb598a738702532cea Mon Sep 17 00:00:00 2001 From: tp Date: Sat, 27 Oct 2018 19:24:28 +0100 Subject: [PATCH 03/12] MultiIndex.labels -> MulitIndex.codes --- pandas/core/frame.py | 13 +- pandas/core/groupby/generic.py | 6 +- pandas/core/groupby/ops.py | 4 +- pandas/core/indexes/multi.py | 178 +- pandas/core/panel.py | 30 +- pandas/core/reshape/concat.py | 24 +- pandas/core/reshape/merge.py | 28 +- pandas/core/reshape/reshape.py | 88 +- pandas/core/series.py | 8 +- pandas/core/sparse/frame.py | 6 +- pandas/core/util/hashing.py | 2 +- pandas/core/window.py | 2 +- pandas/io/formats/excel.py | 14 +- pandas/io/pytables.py | 16 +- pandas/tests/frame/test_alter_axes.py | 26 +- pandas/tests/frame/test_analytics.py | 6 +- pandas/tests/frame/test_indexing.py | 2 +- pandas/tests/frame/test_reshape.py | 34 +- pandas/tests/groupby/conftest.py | 4 +- pandas/tests/groupby/test_categorical.py | 4 +- pandas/tests/groupby/test_counting.py | 2 +- pandas/tests/groupby/test_function.py | 6 +- pandas/tests/groupby/test_groupby.py | 10 +- pandas/tests/groupby/test_grouping.py | 8 +- pandas/tests/groupby/test_whitelist.py | 8 +- pandas/tests/indexes/multi/conftest.py | 12 +- pandas/tests/indexes/multi/test_analytics.py | 18 +- pandas/tests/indexes/multi/test_astype.py | 2 +- pandas/tests/indexes/multi/test_compat.py | 12 +- .../tests/indexes/multi/test_constructor.py | 62 +- pandas/tests/indexes/multi/test_contains.py | 2 +- pandas/tests/indexes/multi/test_conversion.py | 10 +- pandas/tests/indexes/multi/test_copy.py | 14 +- pandas/tests/indexes/multi/test_drop.py | 4 +- pandas/tests/indexes/multi/test_duplicates.py | 36 +- .../tests/indexes/multi/test_equivalence.py | 16 +- pandas/tests/indexes/multi/test_format.py | 9 +- pandas/tests/indexes/multi/test_get_set.py | 94 +- pandas/tests/indexes/multi/test_indexing.py | 20 +- pandas/tests/indexes/multi/test_integrity.py | 32 +- pandas/tests/indexes/multi/test_missing.py | 4 +- pandas/tests/indexes/multi/test_monotonic.py | 24 +- pandas/tests/indexes/multi/test_names.py | 6 +- pandas/tests/indexes/multi/test_sorting.py | 8 +- pandas/tests/indexes/test_base.py | 2 +- pandas/tests/indexing/test_multiindex.py | 2249 +++++++++++++++++ pandas/tests/internals/test_internals.py | 4 +- pandas/tests/io/formats/test_to_csv.py | 4 +- pandas/tests/io/formats/test_to_html.py | 4 +- pandas/tests/io/parser/header.py | 407 +++ pandas/tests/io/parser/index_col.py | 171 ++ pandas/tests/io/test_excel.py | 6 +- pandas/tests/io/test_html.py | 4 +- pandas/tests/reshape/merge/test_join.py | 4 +- pandas/tests/reshape/test_concat.py | 20 +- pandas/tests/reshape/test_pivot.py | 14 +- pandas/tests/reshape/test_reshape.py | 2 +- pandas/tests/series/indexing/test_indexing.py | 4 +- pandas/tests/series/test_alter_axes.py | 12 +- pandas/tests/series/test_analytics.py | 12 +- pandas/tests/series/test_repr.py | 4 +- pandas/tests/series/test_timeseries.py | 4 +- pandas/tests/test_multilevel.py | 60 +- pandas/tests/test_panel.py | 16 +- pandas/util/testing.py | 2 +- 65 files changed, 3366 insertions(+), 553 deletions(-) create mode 
100644 pandas/tests/indexing/test_multiindex.py create mode 100644 pandas/tests/io/parser/header.py create mode 100644 pandas/tests/io/parser/index_col.py diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2c1fa5ef4439e..45e4f4e0261f0 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1839,7 +1839,7 @@ def to_panel(self): selfsorted = self major_axis, minor_axis = selfsorted.index.levels - major_labels, minor_labels = selfsorted.index.labels + major_codes, minor_codes = selfsorted.index.codes shape = len(major_axis), len(minor_axis) # preserve names, if any @@ -1854,8 +1854,8 @@ def to_panel(self): # create new manager new_mgr = selfsorted._data.reshape_nd(axes=new_axes, - labels=[major_labels, - minor_labels], + labels=[major_codes, + minor_codes], shape=shape, ref_items=selfsorted.columns) @@ -4226,7 +4226,7 @@ def _maybe_casted_values(index, labels=None): if isinstance(self.index, MultiIndex): names = [n if n is not None else ('level_%d' % i) for (i, n) in enumerate(self.index.names)] - to_insert = lzip(self.index.levels, self.index.labels) + to_insert = lzip(self.index.levels, self.index.codes) else: default = 'index' if 'index' not in self else 'level_0' names = ([default] if self.index.name is None @@ -7147,8 +7147,9 @@ def _count_level(self, level, axis=0, numeric_only=False): level = count_axis._get_level_number(level) level_index = count_axis.levels[level] - labels = ensure_int64(count_axis.labels[level]) - counts = lib.count_level_2d(mask, labels, len(level_index), axis=0) + level_codes = ensure_int64(count_axis.codes[level]) + counts = lib.count_level_2d(mask, level_codes, len(level_index), + axis=0) result = DataFrame(counts, index=level_index, columns=agg_axis) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index a17e2ce7f1ef5..95bf50fc4ca4d 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1112,7 +1112,7 @@ def value_counts(self, normalize=False, sort=True, ascending=False, lab = cut(Series(val), bins, include_lowest=True) lev = lab.cat.categories lab = lev.take(lab.cat.codes) - llab = lambda lab, inc: lab[inc]._multiindex.labels[-1] + llab = lambda lab, inc: lab[inc]._multiindex.codes[-1] if is_interval_dtype(lab): # TODO: should we do this inside II? 
@@ -1163,7 +1163,7 @@ def value_counts(self, normalize=False, sort=True, ascending=False, out, labels[-1] = out[sorter], labels[-1][sorter] if bins is None: - mi = MultiIndex(levels=levels, labels=labels, names=names, + mi = MultiIndex(levels=levels, codes=labels, names=names, verify_integrity=False) if is_integer_dtype(out): @@ -1194,7 +1194,7 @@ def value_counts(self, normalize=False, sort=True, ascending=False, labels = list(map(lambda lab: np.repeat(lab[diff], nbin), labels[:-1])) labels.append(left[-1]) - mi = MultiIndex(levels=levels, labels=labels, names=names, + mi = MultiIndex(levels=levels, codes=labels, names=names, verify_integrity=False) if is_integer_dtype(out): diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 8455c03953ad1..87f48d5a40554 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -290,10 +290,10 @@ def result_index(self): if not self.compressed and len(self.groupings) == 1: return self.groupings[0].result_index.rename(self.names[0]) - labels = self.recons_labels + codes = self.recons_labels levels = [ping.result_index for ping in self.groupings] result = MultiIndex(levels=levels, - labels=labels, + codes=codes, verify_integrity=False, names=self.names) return result diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index ab032026c957d..90c4d4f7dd21b 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -210,25 +210,26 @@ class MultiIndex(Index): # -------------------------------------------------------------------- # Constructors - def __new__(cls, levels=None, labels=None, sortorder=None, names=None, + @deprecate_kwarg(old_arg_name='labels', new_arg_name='codes') + def __new__(cls, levels=None, codes=None, sortorder=None, names=None, dtype=None, copy=False, name=None, verify_integrity=True, _set_identity=True): # compat with Index if name is not None: names = name - if levels is None or labels is None: - raise TypeError("Must pass both levels and labels") - if len(levels) != len(labels): - raise ValueError('Length of levels and labels must be the same.') + if levels is None or codes is None: + raise TypeError("Must pass both levels and codes") + if len(levels) != len(codes): + raise ValueError('Length of levels and codes must be the same.') if len(levels) == 0: - raise ValueError('Must pass non-zero number of levels/labels') + raise ValueError('Must pass non-zero number of levels/codes') result = object.__new__(MultiIndex) # we've already validated levels and labels, so shortcut here result._set_levels(levels, copy=copy, validate=False) - result._set_codes(labels, copy=copy, validate=False) + result._set_codes(codes, copy=copy, validate=False) if names is not None: # handles name validation @@ -263,13 +264,13 @@ def _verify_integrity(self, codes=None, levels=None): """ # NOTE: Currently does not check, among other things, that cached # nlevels matches nor that sortorder matches actually sortorder. - codes = codes or self.labels + codes = codes or self.codes levels = levels or self.levels if len(levels) != len(codes): raise ValueError("Length of levels and codes must match. 
NOTE:" " this index is in an inconsistent state.") - codes_length = len(self.labels[0]) + codes_length = len(self.codes[0]) for i, (level, level_codes) in enumerate(zip(levels, codes)): if len(level_codes) != codes_length: raise ValueError("Unequal code lengths: %s" % @@ -527,7 +528,7 @@ def set_levels(self, levels, level=None, inplace=False, inplace : bool if True, mutates in place verify_integrity : bool (default True) - if True, checks that levels and labels are compatible + if True, checks that levels and codes are compatible Returns ------- @@ -580,7 +581,7 @@ def set_levels(self, levels, level=None, inplace=False, return idx @property - def labels(self): + def codes(self): return self._codes def _set_codes(self, codes, level=None, copy=False, validate=True, @@ -635,7 +636,7 @@ def set_codes(self, codes, level=None, inplace=False, inplace : bool if True, mutates in place verify_integrity : bool (default True) - if True, checks that levels and labels are compatible + if True, checks that levels and codes are compatible Returns ------- @@ -714,13 +715,13 @@ def copy(self, names=None, dtype=None, levels=None, labels=None, if levels is None: levels = deepcopy(self.levels) if labels is None: - labels = deepcopy(self.labels) + labels = deepcopy(self.codes) else: if levels is None: levels = self.levels if labels is None: - labels = self.labels - return MultiIndex(levels=levels, labels=labels, names=names, + labels = self.codes + return MultiIndex(levels=levels, codes=labels, names=names, sortorder=self.sortorder, verify_integrity=False, _set_identity=_set_identity) @@ -739,7 +740,7 @@ def _shallow_copy_with_infer(self, values, **kwargs): # Therefore, an empty MultiIndex is returned GH13490 if len(values) == 0: return MultiIndex(levels=[[] for _ in range(self.nlevels)], - labels=[[] for _ in range(self.nlevels)], + codes=[[] for _ in range(self.nlevels)], **kwargs) return self._shallow_copy(values, **kwargs) @@ -800,7 +801,7 @@ def _nbytes(self, deep=False): objsize = 24 level_nbytes = sum(i.memory_usage(deep=deep) for i in self.levels) - label_nbytes = sum(i.nbytes for i in self.labels) + label_nbytes = sum(i.nbytes for i in self.codes) names_nbytes = sum(getsizeof(i, objsize) for i in self.names) result = level_nbytes + label_nbytes + names_nbytes @@ -835,26 +836,26 @@ def _format_data(self, name=None): def _format_native_types(self, na_rep='nan', **kwargs): new_levels = [] - new_labels = [] + new_codes = [] # go through the levels and format them - for level, label in zip(self.levels, self.labels): + for level, level_codes in zip(self.levels, self.codes): level = level._format_native_types(na_rep=na_rep, **kwargs) # add nan values, if there are any - mask = (label == -1) + mask = (level_codes == -1) if mask.any(): nan_index = len(level) level = np.append(level, na_rep) - label = label.values() - label[mask] = nan_index + level_codes = level_codes.values() + level_codes[mask] = nan_index new_levels.append(level) - new_labels.append(label) + new_codes.append(level_codes) if len(new_levels) == 1: return Index(new_levels[0])._format_native_types() else: # reconstruct the multi-index - mi = MultiIndex(levels=new_levels, labels=new_labels, + mi = MultiIndex(levels=new_levels, codes=new_codes, names=self.names, sortorder=self.sortorder, verify_integrity=False) return mi.values @@ -922,7 +923,7 @@ def format(self, space=2, sparsify=None, adjoin=True, names=False, # -------------------------------------------------------------------- def __len__(self): - return len(self.labels[0]) + return 
len(self.codes[0]) def _get_names(self): return FrozenList(level.name for level in self.levels) @@ -984,7 +985,7 @@ def _set_names(self, names, level=None, validate=True): @Appender(_index_shared_docs['_get_grouper_for_level']) def _get_grouper_for_level(self, mapper, level): - indexer = self.labels[level] + indexer = self.codes[level] level_index = self.levels[level] if mapper is not None: @@ -993,25 +994,24 @@ def _get_grouper_for_level(self, mapper, level): grouper = level_values.map(mapper) return grouper, None, None - labels, uniques = algos.factorize(indexer, sort=True) + codes, uniques = algos.factorize(indexer, sort=True) if len(uniques) > 0 and uniques[0] == -1: # Handle NAs mask = indexer != -1 - ok_labels, uniques = algos.factorize(indexer[mask], - sort=True) + ok_codes, uniques = algos.factorize(indexer[mask], sort=True) - labels = np.empty(len(indexer), dtype=indexer.dtype) - labels[mask] = ok_labels - labels[~mask] = -1 + codes = np.empty(len(indexer), dtype=indexer.dtype) + codes[mask] = ok_codes + codes[~mask] = -1 if len(uniques) < len(level_index): # Remove unobserved levels from level_index level_index = level_index.take(uniques) - grouper = level_index.take(labels) + grouper = level_index.take(codes) - return grouper, labels, level_index + return grouper, codes, level_index @property def _constructor(self): @@ -1065,8 +1065,8 @@ def _engine(self): # Check the total number of bits needed for our representation: if lev_bits[0] > 64: # The levels would overflow a 64 bit uint - use Python integers: - return MultiIndexPyIntEngine(self.levels, self.labels, offsets) - return MultiIndexUIntEngine(self.levels, self.labels, offsets) + return MultiIndexPyIntEngine(self.levels, self.codes, offsets) + return MultiIndexUIntEngine(self.levels, self.codes, offsets) @property def values(self): @@ -1177,7 +1177,7 @@ def duplicated(self, keep='first'): from pandas._libs.hashtable import duplicated_int64 shape = map(len, self.levels) - ids = get_group_index(self.labels, shape, sort=False, xnull=False) + ids = get_group_index(self.codes, shape, sort=False, xnull=False) return duplicated_int64(ids, keep) @@ -1189,7 +1189,7 @@ def fillna(self, value=None, downcast=None): @Appender(_index_shared_docs['dropna']) def dropna(self, how='any'): - nans = [label == -1 for label in self.labels] + nans = [level_codes == -1 for level_codes in self.codes] if how == 'any': indexer = np.any(nans, axis=0) elif how == 'all': @@ -1197,8 +1197,8 @@ def dropna(self, how='any'): else: raise ValueError("invalid how option: {0}".format(how)) - new_labels = [label[~indexer] for label in self.labels] - return self.copy(labels=new_labels, deep=True) + new_codes = [label[~indexer] for label in self.codes] + return self.copy(labels=new_codes, deep=True) def get_value(self, series, key): # somewhat broken encapsulation @@ -1279,10 +1279,10 @@ def _get_level_values(self, level, unique=False): """ values = self.levels[level] - labels = self.labels[level] + level_codes = self.codes[level] if unique: - labels = algos.unique(labels) - filled = algos.take_1d(values._values, labels, + level_codes = algos.unique(level_codes) + filled = algos.take_1d(values._values, level_codes, fill_value=values._na_value) values = values._shallow_copy(filled) return values @@ -1418,14 +1418,15 @@ def to_hierarchical(self, n_repeat, n_shuffle=1): [0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1]]) """ levels = self.levels - labels = [np.repeat(x, n_repeat) for x in self.labels] - # Assumes that each label is divisible by n_shuffle - labels = 
[x.reshape(n_shuffle, -1).ravel(order='F') for x in labels] + codes = [np.repeat(level_codes, n_repeat) for + level_codes in self.codes] + # Assumes that each level_codes is divisible by n_shuffle + codes = [x.reshape(n_shuffle, -1).ravel(order='F') for x in codes] names = self.names warnings.warn("Method .to_hierarchical is deprecated and will " "be removed in a future version", FutureWarning, stacklevel=2) - return MultiIndex(levels=levels, labels=labels, names=names) + return MultiIndex(levels=levels, codes=codes, names=names) def to_flat_index(self): """ @@ -1461,7 +1462,7 @@ def is_all_dates(self): def is_lexsorted(self): """ - Return True if the labels are lexicographically sorted + Return True if the codes are lexicographically sorted """ return self.lexsort_depth == self.nlevels @@ -1473,7 +1474,7 @@ def lexsort_depth(self): else: return 0 - int64_labels = [ensure_int64(lab) for lab in self.labels] + int64_labels = [ensure_int64(lab) for lab in self.codes] for k in range(self.nlevels, 0, -1): if libalgos.is_lexsorted(int64_labels[:k]): return k @@ -1519,7 +1520,7 @@ def _sort_levels_monotonic(self): new_levels = [] new_labels = [] - for lev, lab in zip(self.levels, self.labels): + for lev, lab in zip(self.levels, self.codes): if not lev.is_monotonic: try: @@ -1579,7 +1580,7 @@ def remove_unused_levels(self): new_labels = [] changed = False - for lev, lab in zip(self.levels, self.labels): + for lev, lab in zip(self.levels, self.codes): # Since few levels are typically unused, bincount() is more # efficient than unique() - however it only accepts positive values @@ -1636,7 +1637,7 @@ def levshape(self): def __reduce__(self): """Necessary for making this object picklable""" d = dict(levels=[lev for lev in self.levels], - labels=[label for label in self.labels], + labels=[label for label in self.codes], sortorder=self.sortorder, names=list(self.names)) return ibase._new_Index, (self.__class__, d), None @@ -1666,7 +1667,7 @@ def __getitem__(self, key): key = com.cast_scalar_indexer(key) retval = [] - for lev, lab in zip(self.levels, self.labels): + for lev, lab in zip(self.levels, self.codes): if lab[key] == -1: retval.append(np.nan) else: @@ -1684,9 +1685,9 @@ def __getitem__(self, key): if isinstance(key, Index): key = np.asarray(key) - new_labels = [lab[key] for lab in self.labels] + new_labels = [lab[key] for lab in self.codes] - return MultiIndex(levels=self.levels, labels=new_labels, + return MultiIndex(levels=self.levels, codes=new_labels, names=self.names, sortorder=sortorder, verify_integrity=False) @@ -1695,11 +1696,11 @@ def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): nv.validate_take(tuple(), kwargs) indices = ensure_platform_int(indices) - taken = self._assert_take_fillable(self.labels, indices, + taken = self._assert_take_fillable(self.codes, indices, allow_fill=allow_fill, fill_value=fill_value, na_value=-1) - return MultiIndex(levels=self.levels, labels=taken, + return MultiIndex(levels=self.levels, codes=taken, names=self.names, verify_integrity=False) def _assert_take_fillable(self, values, indices, allow_fill=True, @@ -1711,7 +1712,7 @@ def _assert_take_fillable(self, values, indices, allow_fill=True, msg = ('When allow_fill=True and fill_value is not None, ' 'all indices must be >= -1') raise ValueError(msg) - taken = [lab.take(indices) for lab in self.labels] + taken = [lab.take(indices) for lab in self.codes] mask = indices == -1 if mask.any(): masked = [] @@ -1721,7 +1722,7 @@ def _assert_take_fillable(self, values, indices, 
allow_fill=True, masked.append(np.asarray(label_values)) taken = masked else: - taken = [lab.take(indices) for lab in self.labels] + taken = [lab.take(indices) for lab in self.codes] return taken def append(self, other): @@ -1763,9 +1764,10 @@ def argsort(self, *args, **kwargs): def repeat(self, repeats, *args, **kwargs): nv.validate_repeat(args, kwargs) return MultiIndex(levels=self.levels, - labels=[label.view(np.ndarray).repeat(repeats) - for label in self.labels], names=self.names, - sortorder=self.sortorder, verify_integrity=False) + codes=[level_codes.view(np.ndarray).repeat(repeats) + for level_codes in self.codes], + names=self.names, sortorder=self.sortorder, + verify_integrity=False) def where(self, cond, other=None): raise NotImplementedError(".where is not supported for " @@ -1834,7 +1836,7 @@ def _drop_from_level(self, labels, level): index = self.levels[i] values = index.get_indexer(labels) - mask = ~algos.isin(self.labels[i], values) + mask = ~algos.isin(self.codes[i], values) return self[mask] @@ -1881,7 +1883,7 @@ def swaplevel(self, i=-2, j=-1): labels=[[0, 1, 0, 1], [0, 0, 1, 1]]) """ new_levels = list(self.levels) - new_labels = list(self.labels) + new_labels = list(self.codes) new_names = list(self.names) i = self._get_level_number(i) @@ -1891,7 +1893,7 @@ def swaplevel(self, i=-2, j=-1): new_labels[i], new_labels[j] = new_labels[j], new_labels[i] new_names[i], new_names[j] = new_names[j], new_names[i] - return MultiIndex(levels=new_levels, labels=new_labels, + return MultiIndex(levels=new_levels, codes=new_labels, names=new_names, verify_integrity=False) def reorder_levels(self, order): @@ -1907,10 +1909,10 @@ def reorder_levels(self, order): 'number of levels (%d), got %d' % (self.nlevels, len(order))) new_levels = [self.levels[i] for i in order] - new_labels = [self.labels[i] for i in order] + new_labels = [self.codes[i] for i in order] new_names = [self.names[i] for i in order] - return MultiIndex(levels=new_levels, labels=new_labels, + return MultiIndex(levels=new_levels, codes=new_labels, names=new_names, verify_integrity=False) def __getslice__(self, i, j): @@ -1931,7 +1933,7 @@ def cats(label): dtype=label.dtype) return [Categorical.from_codes(label, cats(label), ordered=True) - for label in self.labels] + for label in self.codes] def sortlevel(self, level=0, ascending=True, sort_remaining=True): """ @@ -1968,13 +1970,13 @@ def sortlevel(self, level=0, ascending=True, sort_remaining=True): raise ValueError("level must have same length as ascending") from pandas.core.sorting import lexsort_indexer - indexer = lexsort_indexer([self.labels[lev] for lev in level], + indexer = lexsort_indexer([self.codes[lev] for lev in level], orders=ascending) # level ordering else: - labels = list(self.labels) + labels = list(self.codes) shape = list(self.levshape) # partition labels and shape @@ -1994,9 +1996,9 @@ def sortlevel(self, level=0, ascending=True, sort_remaining=True): indexer = indexer[::-1] indexer = ensure_platform_int(indexer) - new_labels = [lab.take(indexer) for lab in self.labels] + new_labels = [lab.take(indexer) for lab in self.codes] - new_index = MultiIndex(labels=new_labels, levels=self.levels, + new_index = MultiIndex(codes=new_labels, levels=self.levels, names=self.names, sortorder=sortorder, verify_integrity=False) @@ -2211,7 +2213,7 @@ def _partial_tup_index(self, tup, side='left'): n = len(tup) start, end = 0, len(self) - zipped = zip(tup, self.levels, self.labels) + zipped = zip(tup, self.levels, self.codes) for k, (lab, lev, labs) in 
enumerate(zipped): section = labs[start:end] @@ -2323,7 +2325,7 @@ def _maybe_to_slice(loc): loc = np.arange(start, stop, dtype='int64') for i, k in enumerate(follow_key, len(lead_key)): - mask = self.labels[i][loc] == self.levels[i].get_loc(k) + mask = self.codes[i][loc] == self.levels[i].get_loc(k) if not mask.all(): loc = loc[mask] if not len(loc): @@ -2474,7 +2476,7 @@ def _get_level_indexer(self, key, level=0, indexer=None): # if the indexer is provided, then use this level_index = self.levels[level] - labels = self.labels[level] + labels = self.codes[level] def convert_indexer(start, stop, step, indexer=indexer, labels=labels): # given the inputs and the labels/indexer, compute an indexer set @@ -2706,10 +2708,10 @@ def truncate(self, before=None, after=None): new_levels = list(self.levels) new_levels[0] = new_levels[0][i:j] - new_labels = [lab[left:right] for lab in self.labels] + new_labels = [lab[left:right] for lab in self.codes] new_labels[0] = new_labels[0] - i - return MultiIndex(levels=new_levels, labels=new_labels, + return MultiIndex(levels=new_levels, codes=new_labels, verify_integrity=False) def equals(self, other): @@ -2738,12 +2740,12 @@ def equals(self, other): return False for i in range(self.nlevels): - slabels = self.labels[i] + slabels = self.codes[i] slabels = slabels[slabels != -1] svalues = algos.take_nd(np.asarray(self.levels[i]._values), slabels, allow_fill=False) - olabels = other.labels[i] + olabels = other.codes[i] olabels = olabels[olabels != -1] ovalues = algos.take_nd( np.asarray(other.levels[i]._values), @@ -2823,7 +2825,7 @@ def intersection(self, other): uniq_tuples = sorted(set(self_tuples) & set(other_tuples)) if len(uniq_tuples) == 0: return MultiIndex(levels=self.levels, - labels=[[]] * self.nlevels, + codes=[[]] * self.nlevels, names=result_names, verify_integrity=False) else: return MultiIndex.from_arrays(lzip(*uniq_tuples), sortorder=0, @@ -2853,7 +2855,7 @@ def difference(self, other, sort=True): if self.equals(other): return MultiIndex(levels=self.levels, - labels=[[]] * self.nlevels, + codes=[[]] * self.nlevels, names=result_names, verify_integrity=False) this = self._get_unique_index() @@ -2869,7 +2871,7 @@ def difference(self, other, sort=True): if len(difference) == 0: return MultiIndex(levels=[[]] * self.nlevels, - labels=[[]] * self.nlevels, + codes=[[]] * self.nlevels, names=result_names, verify_integrity=False) else: return MultiIndex.from_tuples(difference, sortorder=0, @@ -2895,7 +2897,7 @@ def _convert_can_do_setop(self, other): if not hasattr(other, 'names'): if len(other) == 0: other = MultiIndex(levels=[[]] * self.nlevels, - labels=[[]] * self.nlevels, + codes=[[]] * self.nlevels, verify_integrity=False) else: msg = 'other must be a MultiIndex or a list of tuples' @@ -2931,7 +2933,7 @@ def insert(self, loc, item): new_levels = [] new_labels = [] - for k, level, labels in zip(item, self.levels, self.labels): + for k, level, labels in zip(item, self.levels, self.codes): if k not in level: # have to insert into level # must insert at end otherwise you have to recompute all the @@ -2944,7 +2946,7 @@ def insert(self, loc, item): new_levels.append(level) new_labels.append(np.insert(ensure_int64(labels), loc, lev_loc)) - return MultiIndex(levels=new_levels, labels=new_labels, + return MultiIndex(levels=new_levels, codes=new_labels, names=self.names, verify_integrity=False) def delete(self, loc): @@ -2955,8 +2957,8 @@ def delete(self, loc): ------- new_index : MultiIndex """ - new_labels = [np.delete(lab, loc) for lab in self.labels] - 
return MultiIndex(levels=self.levels, labels=new_labels, + new_labels = [np.delete(lab, loc) for lab in self.codes] + return MultiIndex(levels=self.levels, codes=new_labels, names=self.names, verify_integrity=False) def _wrap_joined_index(self, joined, other): @@ -2972,7 +2974,7 @@ def isin(self, values, level=None): else: num = self._get_level_number(level) levs = self.levels[num] - labs = self.labels[num] + labs = self.codes[num] sought_labels = levs.isin(values).nonzero()[0] if levs.size == 0: diff --git a/pandas/core/panel.py b/pandas/core/panel.py index bfa00d1352401..bb3412a3d7c0c 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -953,46 +953,46 @@ def to_frame(self, filter_observations=True): def construct_multi_parts(idx, n_repeat, n_shuffle=1): # Replicates and shuffles MultiIndex, returns individual attributes - labels = [np.repeat(x, n_repeat) for x in idx.labels] + codes = [np.repeat(x, n_repeat) for x in idx.codes] # Assumes that each label is divisible by n_shuffle - labels = [x.reshape(n_shuffle, -1).ravel(order='F') - for x in labels] - labels = [x[selector] for x in labels] + codes = [x.reshape(n_shuffle, -1).ravel(order='F') + for x in codes] + codes = [x[selector] for x in codes] levels = idx.levels names = idx.names - return labels, levels, names + return codes, levels, names def construct_index_parts(idx, major=True): levels = [idx] if major: - labels = [np.arange(N).repeat(K)[selector]] + codes = [np.arange(N).repeat(K)[selector]] names = idx.name or 'major' else: - labels = np.arange(K).reshape(1, K)[np.zeros(N, dtype=int)] - labels = [labels.ravel()[selector]] + codes = np.arange(K).reshape(1, K)[np.zeros(N, dtype=int)] + codes = [codes.ravel()[selector]] names = idx.name or 'minor' names = [names] - return labels, levels, names + return codes, levels, names if isinstance(self.major_axis, MultiIndex): - major_labels, major_levels, major_names = construct_multi_parts( + major_codes, major_levels, major_names = construct_multi_parts( self.major_axis, n_repeat=K) else: - major_labels, major_levels, major_names = construct_index_parts( + major_codes, major_levels, major_names = construct_index_parts( self.major_axis) if isinstance(self.minor_axis, MultiIndex): - minor_labels, minor_levels, minor_names = construct_multi_parts( + minor_codes, minor_levels, minor_names = construct_multi_parts( self.minor_axis, n_repeat=N, n_shuffle=K) else: - minor_labels, minor_levels, minor_names = construct_index_parts( + minor_codes, minor_levels, minor_names = construct_index_parts( self.minor_axis, major=False) levels = major_levels + minor_levels - labels = major_labels + minor_labels + codes = major_codes + minor_codes names = major_names + minor_names - index = MultiIndex(levels=levels, labels=labels, names=names, + index = MultiIndex(levels=levels, codes=codes, names=names, verify_integrity=False) return DataFrame(data, index=index, columns=self.items) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index f01c9d29fd457..b13b22d2e8266 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -555,9 +555,9 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None): levels = [ensure_index(x) for x in levels] if not _all_indexes_same(indexes): - label_list = [] + codes_list = [] - # things are potentially different sizes, so compute the exact labels + # things are potentially different sizes, so compute the exact codes # for each level and pass those to MultiIndex.from_arrays for hlevel, level in zip(zipped, 
levels): @@ -570,18 +570,18 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None): .format(key=key, level=level)) to_concat.append(np.repeat(i, len(index))) - label_list.append(np.concatenate(to_concat)) + codes_list.append(np.concatenate(to_concat)) concat_index = _concat_indexes(indexes) # these go at the end if isinstance(concat_index, MultiIndex): levels.extend(concat_index.levels) - label_list.extend(concat_index.labels) + codes_list.extend(concat_index.codes) else: codes, categories = _factorize_from_iterable(concat_index) levels.append(categories) - label_list.append(codes) + codes_list.append(codes) if len(names) == len(levels): names = list(names) @@ -594,7 +594,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None): # also copies names = names + _get_consensus_names(indexes) - return MultiIndex(levels=levels, labels=label_list, names=names, + return MultiIndex(levels=levels, codes=codes_list, names=names, verify_integrity=False) new_index = indexes[0] @@ -605,8 +605,8 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None): new_names = list(names) new_levels = list(levels) - # construct labels - new_labels = [] + # construct codes + new_codes = [] # do something a bit more speedy @@ -619,17 +619,17 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None): raise ValueError('Values not found in passed level: {hlevel!s}' .format(hlevel=hlevel[mask])) - new_labels.append(np.repeat(mapped, n)) + new_codes.append(np.repeat(mapped, n)) if isinstance(new_index, MultiIndex): new_levels.extend(new_index.levels) - new_labels.extend([np.tile(lab, kpieces) for lab in new_index.labels]) + new_codes.extend([np.tile(lab, kpieces) for lab in new_index.codes]) else: new_levels.append(new_index) - new_labels.append(np.tile(np.arange(n), kpieces)) + new_codes.append(np.tile(np.arange(n), kpieces)) if len(new_names) < len(new_levels): new_names.extend(new_index.names) - return MultiIndex(levels=new_levels, labels=new_labels, names=new_names, + return MultiIndex(levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index b078ff32f6944..4bb5469fa411d 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -857,9 +857,9 @@ def _get_merge_keys(self): left_keys.append(left._get_label_or_level_values(k)) join_names.append(k) if isinstance(self.right.index, MultiIndex): - right_keys = [lev._values.take(lab) - for lev, lab in zip(self.right.index.levels, - self.right.index.labels)] + right_keys = [lev._values.take(lev_codes) for lev, lev_codes + in zip(self.right.index.levels, + self.right.index.codes)] else: right_keys = [self.right.index.values] elif _any(self.right_on): @@ -871,9 +871,9 @@ def _get_merge_keys(self): right_keys.append(right._get_label_or_level_values(k)) join_names.append(k) if isinstance(self.left.index, MultiIndex): - left_keys = [lev._values.take(lab) - for lev, lab in zip(self.left.index.levels, - self.left.index.labels)] + left_keys = [lev._values.take(lev_codes) for lev, lev_codes + in zip(self.left.index.levels, + self.left.index.codes)] else: left_keys = [self.left.index.values] @@ -1508,27 +1508,29 @@ def _get_multiindex_indexer(join_keys, index, sort): fkeys = partial(_factorize_keys, sort=sort) # left & right join labels and num. 
of levels at each location - rlab, llab, shape = map(list, zip(* map(fkeys, index.levels, join_keys))) + rcodes, lcodes, shape = map(list, zip(* map(fkeys, + index.levels, + join_keys))) if sort: - rlab = list(map(np.take, rlab, index.labels)) + rcodes = list(map(np.take, rcodes, index.codes)) else: i8copy = lambda a: a.astype('i8', subok=False, copy=True) - rlab = list(map(i8copy, index.labels)) + rcodes = list(map(i8copy, index.codes)) # fix right labels if there were any nulls for i in range(len(join_keys)): - mask = index.labels[i] == -1 + mask = index.codes[i] == -1 if mask.any(): # check if there already was any nulls at this location # if there was, it is factorized to `shape[i] - 1` - a = join_keys[i][llab[i] == shape[i] - 1] + a = join_keys[i][lcodes[i] == shape[i] - 1] if a.size == 0 or not a[0] != a[0]: shape[i] += 1 - rlab[i][mask] = shape[i] - 1 + rcodes[i][mask] = shape[i] - 1 # get flat i8 join keys - lkey, rkey = _get_join_keys(llab, rlab, shape, sort) + lkey, rkey = _get_join_keys(lcodes, rcodes, shape, sort) # factorize keys to a dense i8 space lkey, rkey, count = fkeys(lkey, rkey) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 065728fb239ae..ba86d3d9ba25f 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -100,7 +100,7 @@ def __init__(self, values, index, level=-1, value_columns=None, self.level = self.index._get_level_number(level) # when index includes `nan`, need to lift levels/strides by 1 - self.lift = 1 if -1 in self.index.labels[self.level] else 0 + self.lift = 1 if -1 in self.index.codes[self.level] else 0 self.new_index_levels = list(self.index.levels) self.new_index_names = list(self.index.names) @@ -115,9 +115,9 @@ def __init__(self, values, index, level=-1, value_columns=None, def _make_sorted_values_labels(self): v = self.level - labs = list(self.index.labels) + codes = list(self.index.codes) levs = list(self.index.levels) - to_sort = labs[:v] + labs[v + 1:] + [labs[v]] + to_sort = codes[:v] + codes[v + 1:] + [codes[v]] sizes = [len(x) for x in levs[:v] + levs[v + 1:] + [levs[v]]] comp_index, obs_ids = get_compressed_ids(to_sort, sizes) @@ -243,16 +243,16 @@ def get_new_columns(self): new_levels = self.value_columns.levels + (self.removed_level_full,) new_names = self.value_columns.names + (self.removed_name,) - new_labels = [lab.take(propagator) - for lab in self.value_columns.labels] + new_codes = [lab.take(propagator) + for lab in self.value_columns.codes] else: new_levels = [self.value_columns, self.removed_level_full] new_names = [self.value_columns.name, self.removed_name] - new_labels = [propagator] + new_codes = [propagator] # The two indices differ only if the unstacked level had unused items: if len(self.removed_level_full) != len(self.removed_level): - # In this case, we remap the new labels to the original level: + # In this case, we remap the new codes to the original level: repeater = self.removed_level_full.get_indexer(self.removed_level) if self.lift: repeater = np.insert(repeater, 0, -1) @@ -261,22 +261,22 @@ def get_new_columns(self): repeater = np.arange(stride) - self.lift # The entire level is then just a repetition of the single chunk: - new_labels.append(np.tile(repeater, width)) - return MultiIndex(levels=new_levels, labels=new_labels, + new_codes.append(np.tile(repeater, width)) + return MultiIndex(levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False) def get_new_index(self): - result_labels = [lab.take(self.compressor) - for lab in 
self.sorted_labels[:-1]] + result_codes = [lab.take(self.compressor) + for lab in self.sorted_labels[:-1]] # construct the new index if len(self.new_index_levels) == 1: - lev, lab = self.new_index_levels[0], result_labels[0] + lev, lab = self.new_index_levels[0], result_codes[0] if (lab == -1).any(): lev = lev.insert(len(lev), lev._na_value) return lev.take(lab) - return MultiIndex(levels=self.new_index_levels, labels=result_labels, + return MultiIndex(levels=self.new_index_levels, codes=result_codes, names=self.new_index_names, verify_integrity=False) @@ -293,25 +293,25 @@ def _unstack_multiple(data, clocs, fill_value=None): rlocs = [i for i in range(index.nlevels) if i not in clocs] clevels = [index.levels[i] for i in clocs] - clabels = [index.labels[i] for i in clocs] + ccodes = [index.codes[i] for i in clocs] cnames = [index.names[i] for i in clocs] rlevels = [index.levels[i] for i in rlocs] - rlabels = [index.labels[i] for i in rlocs] + rcodes = [index.codes[i] for i in rlocs] rnames = [index.names[i] for i in rlocs] shape = [len(x) for x in clevels] - group_index = get_group_index(clabels, shape, sort=False, xnull=False) + group_index = get_group_index(ccodes, shape, sort=False, xnull=False) comp_ids, obs_ids = compress_group_index(group_index, sort=False) - recons_labels = decons_obs_group_ids(comp_ids, obs_ids, shape, clabels, - xnull=False) + recons_codes = decons_obs_group_ids(comp_ids, obs_ids, shape, ccodes, + xnull=False) if rlocs == []: # Everything is in clocs, so the dummy df has a regular index dummy_index = Index(obs_ids, name='__placeholder__') else: dummy_index = MultiIndex(levels=rlevels + [obs_ids], - labels=rlabels + [comp_ids], + codes=rcodes + [comp_ids], names=rnames + ['__placeholder__'], verify_integrity=False) @@ -322,7 +322,7 @@ def _unstack_multiple(data, clocs, fill_value=None): unstacked = dummy.unstack('__placeholder__', fill_value=fill_value) new_levels = clevels new_names = cnames - new_labels = recons_labels + new_codes = recons_codes else: if isinstance(data.columns, MultiIndex): result = data @@ -344,11 +344,11 @@ def _unstack_multiple(data, clocs, fill_value=None): new_levels = [unstcols.levels[0]] + clevels new_names = [data.columns.name] + cnames - new_labels = [unstcols.labels[0]] - for rec in recons_labels: - new_labels.append(rec.take(unstcols.labels[-1])) + new_codes = [unstcols.codes[0]] + for rec in recons_codes: + new_codes.append(rec.take(unstcols.codes[-1])) - new_columns = MultiIndex(levels=new_levels, labels=new_labels, + new_columns = MultiIndex(levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False) if isinstance(unstacked, Series): @@ -467,21 +467,21 @@ def factorize(index): return _stack_multi_columns(frame, level_num=level_num, dropna=dropna) elif isinstance(frame.index, MultiIndex): new_levels = list(frame.index.levels) - new_labels = [lab.repeat(K) for lab in frame.index.labels] + new_codes = [lab.repeat(K) for lab in frame.index.codes] clev, clab = factorize(frame.columns) new_levels.append(clev) - new_labels.append(np.tile(clab, N).ravel()) + new_codes.append(np.tile(clab, N).ravel()) new_names = list(frame.index.names) new_names.append(frame.columns.name) - new_index = MultiIndex(levels=new_levels, labels=new_labels, + new_index = MultiIndex(levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False) else: levels, (ilab, clab) = zip(*map(factorize, (frame.index, frame.columns))) - labels = ilab.repeat(K), np.tile(clab, N).ravel() - new_index = MultiIndex(levels=levels, labels=labels, + 
codes = ilab.repeat(K), np.tile(clab, N).ravel() + new_index = MultiIndex(levels=levels, codes=codes, names=[frame.index.name, frame.columns.name], verify_integrity=False) @@ -592,9 +592,9 @@ def _convert_level_number(level_num, columns): # tuple list excluding level for grouping columns if len(frame.columns.levels) > 2: - tuples = list(zip(*[lev.take(lab) - for lev, lab in zip(this.columns.levels[:-1], - this.columns.labels[:-1])])) + tuples = list(zip(*[lev.take(level_codes) for lev, level_codes + in zip(this.columns.levels[:-1], + this.columns.codes[:-1])])) unique_groups = [key for key, _ in itertools.groupby(tuples)] new_names = this.columns.names[:-1] new_columns = MultiIndex.from_tuples(unique_groups, names=new_names) @@ -604,9 +604,9 @@ def _convert_level_number(level_num, columns): # time to ravel the values new_data = {} level_vals = this.columns.levels[-1] - level_labels = sorted(set(this.columns.labels[-1])) - level_vals_used = level_vals[level_labels] - levsize = len(level_labels) + level_codes = sorted(set(this.columns.codes[-1])) + level_vals_used = level_vals[level_codes] + levsize = len(level_codes) drop_cols = [] for key in unique_groups: try: @@ -625,8 +625,8 @@ def _convert_level_number(level_num, columns): slice_len = loc.stop - loc.start if slice_len != levsize: - chunk = this[this.columns[loc]] - chunk.columns = level_vals.take(chunk.columns.labels[-1]) + chunk = this.loc[:, this.columns[loc]] + chunk.columns = level_vals.take(chunk.columns.codes[-1]) value_slice = chunk.reindex(columns=level_vals_used).values else: if (frame._is_homogeneous_type and @@ -660,17 +660,17 @@ def _convert_level_number(level_num, columns): if isinstance(this.index, MultiIndex): new_levels = list(this.index.levels) new_names = list(this.index.names) - new_labels = [lab.repeat(levsize) for lab in this.index.labels] + new_codes = [lab.repeat(levsize) for lab in this.index.codes] else: new_levels = [this.index] - new_labels = [np.arange(N).repeat(levsize)] + new_codes = [np.arange(N).repeat(levsize)] new_names = [this.index.name] # something better? 
new_levels.append(level_vals) - new_labels.append(np.tile(level_labels, N)) + new_codes.append(np.tile(level_codes, N)) new_names.append(frame.columns.names[level_num]) - new_index = MultiIndex(levels=new_levels, labels=new_labels, + new_index = MultiIndex(levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False) result = frame._constructor(new_data, index=new_index, columns=new_columns) @@ -979,13 +979,13 @@ def make_axis_dummies(frame, axis='minor', transform=None): num = numbers.get(axis, axis) items = frame.index.levels[num] - labels = frame.index.labels[num] + codes = frame.index.codes[num] if transform is not None: mapped_items = items.map(transform) - labels, items = _factorize_from_iterable(mapped_items.take(labels)) + codes, items = _factorize_from_iterable(mapped_items.take(codes)) values = np.eye(len(items), dtype=float) - values = values.take(labels, axis=0) + values = values.take(codes, axis=0) return DataFrame(values, columns=items, index=frame.index) diff --git a/pandas/core/series.py b/pandas/core/series.py index 6b4c9927ef0f1..b4b17b43f242c 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1463,14 +1463,14 @@ def count(self, level=None): level = self.index._get_level_number(level) lev = self.index.levels[level] - lab = np.array(self.index.labels[level], subok=False, copy=True) + level_codes = np.array(self.index.codes[level], subok=False, copy=True) - mask = lab == -1 + mask = level_codes == -1 if mask.any(): - lab[mask] = cnt = len(lev) + level_codes[mask] = cnt = len(lev) lev = lev.insert(cnt, lev._na_value) - obs = lab[notna(self.values)] + obs = level_codes[notna(self.values)] out = np.bincount(obs, minlength=len(lev) or None) return self._constructor(out, index=lev, dtype='int64').__finalize__(self) diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index 8fc6a8d8e923f..586193fe11850 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -967,7 +967,7 @@ def stack_sparse_frame(frame): nobs = sum(lengths) # this is pretty fast - minor_labels = np.repeat(np.arange(len(frame.columns)), lengths) + minor_codes = np.repeat(np.arange(len(frame.columns)), lengths) inds_to_concat = [] vals_to_concat = [] @@ -982,10 +982,10 @@ def stack_sparse_frame(frame): inds_to_concat.append(int_index.indices) vals_to_concat.append(series.sp_values) - major_labels = np.concatenate(inds_to_concat) + major_codes = np.concatenate(inds_to_concat) stacked_values = np.concatenate(vals_to_concat) index = MultiIndex(levels=[frame.index, frame.columns], - labels=[major_labels, minor_labels], + codes=[major_codes, minor_codes], verify_integrity=False) lp = DataFrame(stacked_values.reshape((nobs, 1)), index=index, diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index 4be70c530b6b6..29fc1e3671a83 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -146,7 +146,7 @@ def hash_tuples(vals, encoding='utf8', hash_key=None): vals = MultiIndex.from_tuples(vals) # create a list-of-Categoricals - vals = [Categorical(vals.labels[level], + vals = [Categorical(vals.codes[level], vals.levels[level], ordered=False, fastpath=True) diff --git a/pandas/core/window.py b/pandas/core/window.py index 68a36fb2a6999..6c4dde54bd061 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -2462,7 +2462,7 @@ def dataframe_from_int_dict(data, frame_template): # empty result result = DataFrame( index=MultiIndex(levels=[arg1.index, arg2.columns], - labels=[[], []]), + codes=[[], []]), 
columns=arg2.columns, dtype='float64') diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py index c2ea3715b9f3b..d74722996a660 100644 --- a/pandas/io/formats/excel.py +++ b/pandas/io/formats/excel.py @@ -431,9 +431,9 @@ def _format_header_mi(self): name = columns.names[lnum] yield ExcelCell(lnum, coloffset, name, self.header_style) - for lnum, (spans, levels, labels) in enumerate(zip( - level_lengths, columns.levels, columns.labels)): - values = levels.take(labels) + for lnum, (spans, levels, level_codes) in enumerate(zip( + level_lengths, columns.levels, columns.codes)): + values = levels.take(level_codes) for i in spans: if spans[i] > 1: yield ExcelCell(lnum, coloffset + i + 1, values[i], @@ -574,11 +574,11 @@ def _format_hierarchical_rows(self): names=False) level_lengths = get_level_lengths(level_strs) - for spans, levels, labels in zip(level_lengths, - self.df.index.levels, - self.df.index.labels): + for spans, levels, level_codes in zip(level_lengths, + self.df.index.levels, + self.df.index.codes): - values = levels.take(labels, + values = levels.take(level_codes, allow_fill=levels._can_hold_na, fill_value=True) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 640034cb49d25..8132c458ce852 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2605,9 +2605,9 @@ def read_sparse_intindex(self, key, **kwargs): def write_multi_index(self, key, index): setattr(self.attrs, '%s_nlevels' % key, index.nlevels) - for i, (lev, lab, name) in enumerate(zip(index.levels, - index.labels, - index.names)): + for i, (lev, level_codes, name) in enumerate(zip(index.levels, + index.codes, + index.names)): # write the level level_key = '%s_level%d' % (key, i) conv_level = _convert_index(lev, self.encoding, self.errors, @@ -2622,13 +2622,13 @@ def write_multi_index(self, key, index): # write the labels label_key = '%s_label%d' % (key, i) - self.write_array(label_key, lab) + self.write_array(label_key, level_codes) def read_multi_index(self, key, **kwargs): nlevels = getattr(self.attrs, '%s_nlevels' % key) levels = [] - labels = [] + codes = [] names = [] for i in range(nlevels): level_key = '%s_level%d' % (key, i) @@ -2638,10 +2638,10 @@ def read_multi_index(self, key, **kwargs): names.append(name) label_key = '%s_label%d' % (key, i) - lab = self.read_array(label_key, **kwargs) - labels.append(lab) + level_codes = self.read_array(label_key, **kwargs) + codes.append(level_codes) - return MultiIndex(levels=levels, labels=labels, names=names, + return MultiIndex(levels=levels, codes=codes, names=names, verify_integrity=True) def read_index_node(self, node, start=None, stop=None): diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index 33128a8ab179a..ac00e6a063104 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -712,9 +712,9 @@ def test_rename_bug2(self): def test_reorder_levels(self): index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]], - labels=[[0, 0, 0, 0, 0, 0], - [0, 1, 2, 0, 1, 2], - [0, 1, 0, 1, 0, 1]], + codes=[[0, 0, 0, 0, 0, 0], + [0, 1, 2, 0, 1, 2], + [0, 1, 0, 1, 0, 1]], names=['L0', 'L1', 'L2']) df = DataFrame({'A': np.arange(6), 'B': np.arange(6)}, index=index) @@ -729,9 +729,9 @@ def test_reorder_levels(self): # rotate, position result = df.reorder_levels([1, 2, 0]) e_idx = MultiIndex(levels=[['one', 'two', 'three'], [0, 1], ['bar']], - labels=[[0, 1, 2, 0, 1, 2], - [0, 1, 0, 1, 0, 1], - [0, 0, 0, 0, 0, 0]], + codes=[[0, 1, 2, 0, 1, 2], + [0, 1, 
0, 1, 0, 1], + [0, 0, 0, 0, 0, 0]], names=['L1', 'L2', 'L0']) expected = DataFrame({'A': np.arange(6), 'B': np.arange(6)}, index=e_idx) @@ -739,9 +739,9 @@ def test_reorder_levels(self): result = df.reorder_levels([0, 0, 0]) e_idx = MultiIndex(levels=[['bar'], ['bar'], ['bar']], - labels=[[0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0]], + codes=[[0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0]], names=['L0', 'L0', 'L0']) expected = DataFrame({'A': np.arange(6), 'B': np.arange(6)}, index=e_idx) @@ -757,9 +757,9 @@ def test_reset_index(self, float_frame): names = ['first', 'second'] stacked.index.names = names deleveled = stacked.reset_index() - for i, (lev, lab) in enumerate(zip(stacked.index.levels, - stacked.index.labels)): - values = lev.take(lab) + for i, (lev, level_codes) in enumerate(zip(stacked.index.levels, + stacked.index.codes)): + values = lev.take(level_codes) name = names[i] tm.assert_index_equal(values, Index(deleveled[name])) @@ -1093,7 +1093,7 @@ def test_rename_axis_style_raises(self): df.rename(id, mapper=id) def test_reindex_api_equivalence(self): - # equivalence of the labels/axis and index/columns API's + # equivalence of the labels/axis and index/columns API's df = DataFrame([[1, 2, 3], [3, 4, 5], [5, 6, 7]], index=['a', 'b', 'c'], columns=['d', 'e', 'f']) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 2bf2dd593184f..6c30f3fb02fb0 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1008,9 +1008,9 @@ def alt(x): assert_stat_op_api('kurt', float_frame, float_string_frame) index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]], - labels=[[0, 0, 0, 0, 0, 0], - [0, 1, 2, 0, 1, 2], - [0, 1, 0, 1, 0, 1]]) + codes=[[0, 0, 0, 0, 0, 0], + [0, 1, 2, 0, 1, 2], + [0, 1, 0, 1, 0, 1]]) df = DataFrame(np.random.randn(6, 3), index=index) kurt = df.kurt() diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py index 0a61c844f1af8..b95dad422e90a 100644 --- a/pandas/tests/frame/test_indexing.py +++ b/pandas/tests/frame/test_indexing.py @@ -3191,7 +3191,7 @@ def test_type_error_multiindex(self): index = Index(range(2), name='i') columns = MultiIndex(levels=[['x', 'y'], [0, 1]], - labels=[[0, 1], [0, 0]], + codes=[[0, 1], [0, 0]], names=[None, 'c']) expected = DataFrame([[1, 2], [3, 4]], columns=columns, index=index) diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index a53b01466c7a4..bc9a760bc9f1d 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -465,14 +465,14 @@ def test_unstack_level_binding(self): mi = pd.MultiIndex( levels=[[u('foo'), u('bar')], [u('one'), u('two')], [u('a'), u('b')]], - labels=[[0, 0, 1, 1], [0, 1, 0, 1], [1, 0, 1, 0]], + codes=[[0, 0, 1, 1], [0, 1, 0, 1], [1, 0, 1, 0]], names=[u('first'), u('second'), u('third')]) s = pd.Series(0, index=mi) result = s.unstack([1, 2]).stack(0) expected_mi = pd.MultiIndex( levels=[['foo', 'bar'], ['one', 'two']], - labels=[[0, 0, 1, 1], [0, 1, 0, 1]], + codes=[[0, 0, 1, 1], [0, 1, 0, 1]], names=['first', 'second']) expected = pd.DataFrame(np.array([[np.nan, 0], @@ -499,7 +499,7 @@ def test_unstack_to_series(self): result = data.unstack() midx = MultiIndex(levels=[['x', 'y'], ['a', 'b', 'c']], - labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]]) + codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]]) expected = Series([1, 2, np.NaN, 3, 4, np.NaN], index=midx) assert_series_equal(result, expected) @@ 
-574,7 +574,7 @@ def test_unstack_non_unique_index_names(self): df.T.stack('c1') def test_unstack_unused_levels(self): - # GH 17845: unused labels in index make unstack() cast int to float + # GH 17845: unused codes in index make unstack() cast int to float idx = pd.MultiIndex.from_product([['a'], ['A', 'B', 'C', 'D']])[:-1] df = pd.DataFrame([[1, 0]] * 3, index=idx) @@ -587,8 +587,8 @@ def test_unstack_unused_levels(self): # Unused items on both levels levels = [[0, 1, 7], [0, 1, 2, 3]] - labels = [[0, 0, 1, 1], [0, 2, 0, 2]] - idx = pd.MultiIndex(levels, labels) + codes = [[0, 0, 1, 1], [0, 2, 0, 2]] + idx = pd.MultiIndex(levels, codes) block = np.arange(4).reshape(2, 2) df = pd.DataFrame(np.concatenate([block, block + 4]), index=idx) result = df.unstack() @@ -600,8 +600,8 @@ def test_unstack_unused_levels(self): # With mixed dtype and NaN levels = [['a', 2, 'c'], [1, 3, 5, 7]] - labels = [[0, -1, 1, 1], [0, 2, -1, 2]] - idx = pd.MultiIndex(levels, labels) + codes = [[0, -1, 1, 1], [0, 2, -1, 2]] + idx = pd.MultiIndex(levels, codes) data = np.arange(8) df = pd.DataFrame(data.reshape(4, 2), index=idx) @@ -620,7 +620,7 @@ def test_unstack_unused_levels(self): @pytest.mark.parametrize("cols", [['A', 'C'], slice(None)]) def test_unstack_unused_level(self, cols): - # GH 18562 : unused labels on the unstacked level + # GH 18562 : unused codes on the unstacked level df = pd.DataFrame([[2010, 'a', 'I'], [2011, 'b', 'II']], columns=['A', 'B', 'C']) @@ -693,7 +693,7 @@ def verify(df): vals = list(map(list, zip(*vals))) idx = Index([nan, 0, 1, 2, 4, 5, 6, 7], name='B') cols = MultiIndex(levels=[['C'], ['a', 'b']], - labels=[[0, 0], [0, 1]], + codes=[[0, 0], [0, 1]], names=[None, 'A']) right = DataFrame(vals, columns=cols, index=idx) @@ -706,7 +706,7 @@ def verify(df): vals = [[2, nan], [0, 4], [1, 5], [nan, 6], [3, 7]] cols = MultiIndex(levels=[['C'], ['a', 'b']], - labels=[[0, 0], [0, 1]], + codes=[[0, 0], [0, 1]], names=[None, 'A']) idx = Index([nan, 0, 1, 2, 3], name='B') right = DataFrame(vals, columns=cols, index=idx) @@ -719,7 +719,7 @@ def verify(df): vals = [[3, nan], [0, 4], [1, 5], [2, 6], [nan, 7]] cols = MultiIndex(levels=[['C'], ['a', 'b']], - labels=[[0, 0], [0, 1]], + codes=[[0, 0], [0, 1]], names=[None, 'A']) idx = Index([nan, 0, 1, 2, 3], name='B') right = DataFrame(vals, columns=cols, index=idx) @@ -737,7 +737,7 @@ def verify(df): vals = np.array([[3, 0, 1, 2, nan, 4], [nan, 5, 6, 7, 8, 9]]) idx = Index(['a', 'b'], name='A') cols = MultiIndex(levels=[['C'], date_range('2012-01-01', periods=5)], - labels=[[0, 0, 0, 0, 0, 0], [-1, 0, 1, 2, 3, 4]], + codes=[[0, 0, 0, 0, 0, 0], [-1, 0, 1, 2, 3, 4]], names=[None, 'B']) right = DataFrame(vals, columns=cols, index=idx) @@ -759,11 +759,11 @@ def verify(df): [0.0, -0.00015, nan, 2.3614e-05, nan]] idx = MultiIndex(levels=[[680585148, 680607017], [0.0133]], - labels=[[0, 1], [-1, 0]], + codes=[[0, 1], [-1, 0]], names=['s_id', 'dosage']) cols = MultiIndex(levels=[['change'], ['Ag', 'Hg', 'Pb', 'Sn', 'U']], - labels=[[0, 0, 0, 0, 0], [0, 1, 2, 3, 4]], + codes=[[0, 0, 0, 0, 0], [0, 1, 2, 3, 4]], names=[None, 'agent']) right = DataFrame(vals, columns=cols, index=idx) @@ -851,8 +851,8 @@ def _test_stack_with_multiindex(multiindex): expected = DataFrame([[0, 2], [1, nan], [3, 5], [4, nan]], index=MultiIndex( levels=[[0, 1], ['u', 'x', 'y', 'z']], - labels=[[0, 0, 1, 1], - [1, 3, 1, 3]], + codes=[[0, 0, 1, 1], + [1, 3, 1, 3]], names=[None, 'Lower']), columns=Index(['B', 'C'], name='Upper'), dtype=df.dtypes[0]) diff --git 
a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py index 877aa835ac6f5..657da422bf02c 100644 --- a/pandas/tests/groupby/conftest.py +++ b/pandas/tests/groupby/conftest.py @@ -8,8 +8,8 @@ def mframe(): index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', 'three']], - labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=['first', 'second']) return DataFrame(np.random.randn(10, 3), index=index, columns=['A', 'B', 'C']) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 3692d34afcc03..f0d0ac246a251 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -190,7 +190,7 @@ def test_level_get_group(observed): df = DataFrame(data=np.arange(2, 22, 2), index=MultiIndex( levels=[pd.CategoricalIndex(["a", "b"]), range(10)], - labels=[[0] * 5 + [1] * 5, range(10)], + codes=[[0] * 5 + [1] * 5, range(10)], names=["Index1", "Index2"])) g = df.groupby(level=["Index1"], observed=observed) @@ -199,7 +199,7 @@ def test_level_get_group(observed): expected = DataFrame(data=np.arange(2, 12, 2), index=pd.MultiIndex(levels=[pd.CategoricalIndex( ["a", "b"]), range(5)], - labels=[[0] * 5, range(5)], + codes=[[0] * 5, range(5)], names=["Index1", "Index2"])) result = g.get_group('a') diff --git a/pandas/tests/groupby/test_counting.py b/pandas/tests/groupby/test_counting.py index a14b6ff014f37..8b9f3607d5c3e 100644 --- a/pandas/tests/groupby/test_counting.py +++ b/pandas/tests/groupby/test_counting.py @@ -218,7 +218,7 @@ def test_count_with_only_nans_in_first_group(self): df = DataFrame({'A': [np.nan, np.nan], 'B': ['a', 'b'], 'C': [1, 2]}) result = df.groupby(['A', 'B']).C.count() mi = MultiIndex(levels=[[], ['a', 'b']], - labels=[[], []], + codes=[[], []], names=['A', 'B']) expected = Series([], index=mi, dtype=np.int64, name='C') assert_series_equal(result, expected, check_index_type=False) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 646445623778b..310a2fb1e609d 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -247,7 +247,7 @@ def test_non_cython_api(): expected_col = pd.MultiIndex(levels=[['B'], ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']], - labels=[[0] * 8, list(range(8))]) + codes=[[0] * 8, list(range(8))]) expected = pd.DataFrame([[1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0], [0.0, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan]], @@ -733,7 +733,7 @@ def test_frame_describe_multikey(tsframe): # GH 17464 - Remove duplicate MultiIndex levels group_col = pd.MultiIndex( levels=[[col], group.columns], - labels=[[0] * len(group.columns), range(len(group.columns))]) + codes=[[0] * len(group.columns), range(len(group.columns))]) group = pd.DataFrame(group.values, columns=group_col, index=group.index) @@ -747,7 +747,7 @@ def test_frame_describe_multikey(tsframe): expected = tsframe.describe().T expected.index = pd.MultiIndex( levels=[[0, 1], expected.index], - labels=[[0, 0, 1, 1], range(len(expected.index))]) + codes=[[0, 0, 1, 1], range(len(expected.index))]) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 162800b68de4f..6d9f60df45ec8 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -76,7 +76,7 @@ def test_basic(dtype): def 
test_groupby_nonobject_dtype(mframe, df_mixed_floats): - key = mframe.index.labels[0] + key = mframe.index.codes[0] grouped = mframe.groupby(key) result = grouped.sum() @@ -295,7 +295,7 @@ def test_indices_concatenation_order(): def f1(x): y = x[(x.b % 2) == 1] ** 2 if y.empty: - multiindex = MultiIndex(levels=[[]] * 2, labels=[[]] * 2, + multiindex = MultiIndex(levels=[[]] * 2, codes=[[]] * 2, names=['b', 'c']) res = DataFrame(None, columns=['a'], index=multiindex) return res @@ -314,7 +314,7 @@ def f2(x): def f3(x): y = x[(x.b % 2) == 1] ** 2 if y.empty: - multiindex = MultiIndex(levels=[[]] * 2, labels=[[]] * 2, + multiindex = MultiIndex(levels=[[]] * 2, codes=[[]] * 2, names=['foo', 'bar']) res = DataFrame(None, columns=['a', 'b'], index=multiindex) return res @@ -1416,11 +1416,11 @@ def test_groupby_sort_multiindex_series(): # _compress_group_index # GH 9444 index = MultiIndex(levels=[[1, 2], [1, 2]], - labels=[[0, 0, 0, 0, 1, 1], [1, 1, 0, 0, 0, 0]], + codes=[[0, 0, 0, 0, 1, 1], [1, 1, 0, 0, 0, 0]], names=['a', 'b']) mseries = Series([0, 1, 2, 3, 4, 5], index=index) index = MultiIndex(levels=[[1, 2], [1, 2]], - labels=[[0, 0, 1], [1, 0, 0]], names=['a', 'b']) + codes=[[0, 0, 1], [1, 0, 0]], names=['a', 'b']) mseries_result = Series([0, 2, 4], index=index) result = mseries.groupby(level=['a', 'b'], sort=False).first() diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index b6c20d31cddf3..bcf4f42d8ca5e 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -483,8 +483,8 @@ def test_groupby_level_index_names(self): def test_groupby_level_with_nas(self, sort): # GH 17537 index = MultiIndex(levels=[[1, 0], [0, 1, 2, 3]], - labels=[[1, 1, 1, 1, 0, 0, 0, 0], [0, 1, 2, 3, 0, 1, - 2, 3]]) + codes=[[1, 1, 1, 1, 0, 0, 0, 0], [0, 1, 2, 3, 0, 1, + 2, 3]]) # factorizing doesn't confuse things s = Series(np.arange(8.), index=index) @@ -493,8 +493,8 @@ def test_groupby_level_with_nas(self, sort): assert_series_equal(result, expected) index = MultiIndex(levels=[[1, 0], [0, 1, 2, 3]], - labels=[[1, 1, 1, 1, -1, 0, 0, 0], [0, 1, 2, 3, 0, - 1, 2, 3]]) + codes=[[1, 1, 1, 1, -1, 0, 0, 0], [0, 1, 2, 3, 0, + 1, 2, 3]]) # factorizing doesn't confuse things s = Series(np.arange(8.), index=index) diff --git a/pandas/tests/groupby/test_whitelist.py b/pandas/tests/groupby/test_whitelist.py index d5096ee99c8b0..e0f1730d6909f 100644 --- a/pandas/tests/groupby/test_whitelist.py +++ b/pandas/tests/groupby/test_whitelist.py @@ -107,8 +107,8 @@ def s_whitelist_fixture(request): def mframe(): index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', 'three']], - labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=['first', 'second']) return DataFrame(np.random.randn(10, 3), index=index, columns=['A', 'B', 'C']) @@ -195,8 +195,8 @@ def test_groupby_frame_whitelist(df_letters, df_whitelist_fixture): def raw_frame(): index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', 'three']], - labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=['first', 'second']) raw_frame = DataFrame(np.random.randn(10, 3), index=index, columns=Index(['A', 'B', 'C'], name='exp')) diff --git a/pandas/tests/indexes/multi/conftest.py b/pandas/tests/indexes/multi/conftest.py index 9fad4547648d5..7fb862c69f5b2 100644 --- 
a/pandas/tests/indexes/multi/conftest.py +++ b/pandas/tests/indexes/multi/conftest.py @@ -13,11 +13,11 @@ def idx(): major_axis = Index(['foo', 'bar', 'baz', 'qux']) minor_axis = Index(['one', 'two']) - major_labels = np.array([0, 0, 1, 2, 3, 3]) - minor_labels = np.array([0, 1, 0, 1, 0, 1]) + major_codes = np.array([0, 0, 1, 2, 3, 3]) + minor_codes = np.array([0, 1, 0, 1, 0, 1]) index_names = ['first', 'second'] mi = MultiIndex(levels=[major_axis, minor_axis], - labels=[major_labels, minor_labels], + codes=[major_codes, minor_codes], names=index_names, verify_integrity=False) return mi @@ -28,11 +28,11 @@ def idx_dup(): major_axis = Index(['foo', 'bar', 'baz', 'qux']) minor_axis = Index(['one', 'two']) - major_labels = np.array([0, 0, 1, 0, 1, 1]) - minor_labels = np.array([0, 1, 0, 1, 0, 1]) + major_codes = np.array([0, 0, 1, 0, 1, 1]) + minor_codes = np.array([0, 1, 0, 1, 0, 1]) index_names = ['first', 'second'] mi = MultiIndex(levels=[major_axis, minor_axis], - labels=[major_labels, minor_labels], + codes=[major_codes, minor_codes], names=index_names, verify_integrity=False) return mi diff --git a/pandas/tests/indexes/multi/test_analytics.py b/pandas/tests/indexes/multi/test_analytics.py index 3b40b2afe9c6d..a1fb242979a11 100644 --- a/pandas/tests/indexes/multi/test_analytics.py +++ b/pandas/tests/indexes/multi/test_analytics.py @@ -32,11 +32,11 @@ def test_truncate(): major_axis = Index(lrange(4)) minor_axis = Index(lrange(2)) - major_labels = np.array([0, 0, 1, 2, 3, 3]) - minor_labels = np.array([0, 1, 0, 1, 0, 1]) + major_codes = np.array([0, 0, 1, 2, 3, 3]) + minor_codes = np.array([0, 1, 0, 1, 0, 1]) index = MultiIndex(levels=[major_axis, minor_axis], - labels=[major_labels, minor_labels]) + codes=[major_codes, minor_codes]) result = index.truncate(before=1) assert 'foo' not in result.levels[0] @@ -282,13 +282,13 @@ def test_numpy_ufuncs(func): # parameters and fixtures at the same time. major_axis = Index(['foo', 'bar', 'baz', 'qux']) minor_axis = Index(['one', 'two']) - major_labels = np.array([0, 0, 1, 2, 3, 3]) - minor_labels = np.array([0, 1, 0, 1, 0, 1]) + major_codes = np.array([0, 0, 1, 2, 3, 3]) + minor_codes = np.array([0, 1, 0, 1, 0, 1]) index_names = ['first', 'second'] idx = MultiIndex( levels=[major_axis, minor_axis], - labels=[major_labels, minor_labels], + codes=[major_codes, minor_codes], names=index_names, verify_integrity=False ) @@ -307,13 +307,13 @@ def test_numpy_type_funcs(func): # parameters and fixtures at the same time. 
major_axis = Index(['foo', 'bar', 'baz', 'qux']) minor_axis = Index(['one', 'two']) - major_labels = np.array([0, 0, 1, 2, 3, 3]) - minor_labels = np.array([0, 1, 0, 1, 0, 1]) + major_codes = np.array([0, 0, 1, 2, 3, 3]) + minor_codes = np.array([0, 1, 0, 1, 0, 1]) index_names = ['first', 'second'] idx = MultiIndex( levels=[major_axis, minor_axis], - labels=[major_labels, minor_labels], + codes=[major_codes, minor_codes], names=index_names, verify_integrity=False ) diff --git a/pandas/tests/indexes/multi/test_astype.py b/pandas/tests/indexes/multi/test_astype.py index 70d79ddfdc22e..cc7b48069b354 100644 --- a/pandas/tests/indexes/multi/test_astype.py +++ b/pandas/tests/indexes/multi/test_astype.py @@ -11,7 +11,7 @@ def test_astype(idx): expected = idx.copy() actual = idx.astype('O') assert_copy(actual.levels, expected.levels) - assert_copy(actual.labels, expected.labels) + assert_copy(actual.codes, expected.codes) assert [level.name for level in actual.levels] == list(expected.names) with pytest.raises(TypeError, match="^Setting.*dtype.*object"): diff --git a/pandas/tests/indexes/multi/test_compat.py b/pandas/tests/indexes/multi/test_compat.py index 0353e39935cb6..f405fc659c709 100644 --- a/pandas/tests/indexes/multi/test_compat.py +++ b/pandas/tests/indexes/multi/test_compat.py @@ -62,10 +62,10 @@ def test_boolean_context_compat2(): def test_inplace_mutation_resets_values(): levels = [['a', 'b', 'c'], [4]] levels2 = [[1, 2, 3], ['a']] - labels = [[0, 1, 0, 2, 2, 0], [0, 0, 0, 0, 0, 0]] + codes = [[0, 1, 0, 2, 2, 0], [0, 0, 0, 0, 0, 0]] - mi1 = MultiIndex(levels=levels, labels=labels) - mi2 = MultiIndex(levels=levels2, labels=labels) + mi1 = MultiIndex(levels=levels, codes=codes) + mi2 = MultiIndex(levels=levels2, codes=codes) vals = mi1.values.copy() vals2 = mi2.values.copy() @@ -86,13 +86,13 @@ def test_inplace_mutation_resets_values(): tm.assert_almost_equal(mi1.values, vals2) # Make sure label setting works too - labels2 = [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]] + codes2 = [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]] exp_values = np.empty((6,), dtype=object) exp_values[:] = [(long(1), 'a')] * 6 # Must be 1d array of tuples assert exp_values.shape == (6,) - new_values = mi2.set_codes(labels2).values + new_values = mi2.set_codes(codes2).values # Not inplace shouldn't change tm.assert_almost_equal(mi2._tuples, vals2) @@ -101,7 +101,7 @@ def test_inplace_mutation_resets_values(): tm.assert_almost_equal(exp_values, new_values) # ...and again setting inplace should kill _tuples, etc - mi2.set_codes(labels2, inplace=True) + mi2.set_codes(codes2, inplace=True) tm.assert_almost_equal(mi2.values, new_values) diff --git a/pandas/tests/indexes/multi/test_constructor.py b/pandas/tests/indexes/multi/test_constructor.py index 75cd536ef2548..1f674063563c7 100644 --- a/pandas/tests/indexes/multi/test_constructor.py +++ b/pandas/tests/indexes/multi/test_constructor.py @@ -17,7 +17,7 @@ def test_constructor_single_level(): result = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']], - labels=[[0, 1, 2, 3]], names=['first']) + codes=[[0, 1, 2, 3]], names=['first']) assert isinstance(result, MultiIndex) expected = Index(['foo', 'bar', 'baz', 'qux'], name='first') tm.assert_index_equal(result.levels[0], expected) @@ -25,29 +25,29 @@ def test_constructor_single_level(): def test_constructor_no_levels(): - msg = "non-zero number of levels/labels" + msg = "non-zero number of levels/codes" with pytest.raises(ValueError, match=msg): - MultiIndex(levels=[], labels=[]) + MultiIndex(levels=[], codes=[]) - both_re = 
re.compile('Must pass both levels and labels') + both_re = re.compile('Must pass both levels and codes') with pytest.raises(TypeError, match=both_re): MultiIndex(levels=[]) with pytest.raises(TypeError, match=both_re): - MultiIndex(labels=[]) + MultiIndex(codes=[]) def test_constructor_nonhashable_names(): # GH 20527 levels = [[1, 2], [u'one', u'two']] - labels = [[0, 0, 1, 1], [0, 1, 0, 1]] + codes = [[0, 0, 1, 1], [0, 1, 0, 1]] names = (['foo'], ['bar']) message = "MultiIndex.name must be a hashable type" with pytest.raises(TypeError, match=message): - MultiIndex(levels=levels, labels=labels, names=names) + MultiIndex(levels=levels, codes=codes, names=names) # With .rename() mi = MultiIndex(levels=[[1, 2], [u'one', u'two']], - labels=[[0, 0, 1, 1], [0, 1, 0, 1]], + codes=[[0, 0, 1, 1], [0, 1, 0, 1]], names=('foo', 'bar')) renamed = [['foor'], ['barr']] with pytest.raises(TypeError, match=message): @@ -58,13 +58,13 @@ def test_constructor_nonhashable_names(): mi.set_names(names=renamed) -def test_constructor_mismatched_label_levels(idx): - labels = [np.array([1]), np.array([2]), np.array([3])] +def test_constructor_mismatched_codes_levels(idx): + codes = [np.array([1]), np.array([2]), np.array([3])] levels = ["a"] - msg = "Length of levels and labels must be the same" + msg = "Length of levels and codes must be the same" with pytest.raises(ValueError, match=msg): - MultiIndex(levels=levels, labels=labels) + MultiIndex(levels=levels, codes=codes) length_error = re.compile('>= length of level') label_error = re.compile(r'Unequal code lengths: \[4, 2\]') @@ -72,10 +72,10 @@ def test_constructor_mismatched_label_levels(idx): # important to check that it's looking at the right thing. with pytest.raises(ValueError, match=length_error): MultiIndex(levels=[['a'], ['b']], - labels=[[0, 1, 2, 3], [0, 3, 4, 1]]) + codes=[[0, 1, 2, 3], [0, 3, 4, 1]]) with pytest.raises(ValueError, match=label_error): - MultiIndex(levels=[['a'], ['b']], labels=[[0, 0, 0, 0], [0, 0]]) + MultiIndex(levels=[['a'], ['b']], codes=[[0, 0, 0, 0], [0, 0]]) # external API with pytest.raises(ValueError, match=length_error): @@ -87,21 +87,21 @@ def test_constructor_mismatched_label_levels(idx): def test_copy_in_constructor(): levels = np.array(["a", "b", "c"]) - labels = np.array([1, 1, 2, 0, 0, 1, 1]) - val = labels[0] - mi = MultiIndex(levels=[levels, levels], labels=[labels, labels], + codes = np.array([1, 1, 2, 0, 0, 1, 1]) + val = codes[0] + mi = MultiIndex(levels=[levels, levels], codes=[codes, codes], copy=True) - assert mi.labels[0][0] == val - labels[0] = 15 - assert mi.labels[0][0] == val + assert mi.codes[0][0] == val + codes[0] = 15 + assert mi.codes[0][0] == val val = levels[0] levels[0] = "PANDA" assert mi.levels[0][0] == val def test_from_arrays(idx): - arrays = [np.asarray(lev).take(lab) - for lev, lab in zip(idx.levels, idx.labels)] + arrays = [np.asarray(lev).take(level_codes) + for lev, level_codes in zip(idx.levels, idx.codes)] # list of arrays as input result = MultiIndex.from_arrays(arrays, names=idx.names) @@ -116,8 +116,8 @@ def test_from_arrays(idx): def test_from_arrays_iterator(idx): # GH 18434 - arrays = [np.asarray(lev).take(lab) - for lev, lab in zip(idx.levels, idx.labels)] + arrays = [np.asarray(lev).take(level_codes) + for lev, level_codes in zip(idx.levels, idx.codes)] # iterator as input result = MultiIndex.from_arrays(iter(arrays), names=idx.names) @@ -220,7 +220,7 @@ def test_from_arrays_index_series_categorical(): def test_from_arrays_empty(): # 0 levels - msg = "Must pass non-zero number of 
levels/labels" + msg = "Must pass non-zero number of levels/codes" with pytest.raises(ValueError, match=msg): MultiIndex.from_arrays(arrays=[]) @@ -235,7 +235,7 @@ def test_from_arrays_empty(): arrays = [[]] * N names = list('ABC')[:N] result = MultiIndex.from_arrays(arrays=arrays, names=names) - expected = MultiIndex(levels=[[]] * N, labels=[[]] * N, + expected = MultiIndex(levels=[[]] * N, codes=[[]] * N, names=names) tm.assert_index_equal(result, expected) @@ -275,7 +275,7 @@ def test_from_tuples(): MultiIndex.from_tuples([]) expected = MultiIndex(levels=[[1, 3], [2, 4]], - labels=[[0, 1], [0, 1]], + codes=[[0, 1], [0, 1]], names=['a', 'b']) # input tuples @@ -287,7 +287,7 @@ def test_from_tuples_iterator(): # GH 18434 # input iterator for tuples expected = MultiIndex(levels=[[1, 3], [2, 4]], - labels=[[0, 1], [0, 1]], + codes=[[0, 1], [0, 1]], names=['a', 'b']) result = MultiIndex.from_tuples(zip([1, 3], [2, 4]), names=['a', 'b']) @@ -314,7 +314,7 @@ def test_from_tuples_index_values(idx): def test_from_product_empty_zero_levels(): # 0 levels - msg = "Must pass non-zero number of levels/labels" + msg = "Must pass non-zero number of levels/codes" with pytest.raises(ValueError, match=msg): MultiIndex.from_product([]) @@ -334,7 +334,7 @@ def test_from_product_empty_two_levels(first, second): names = ['A', 'B'] result = MultiIndex.from_product([first, second], names=names) expected = MultiIndex(levels=[first, second], - labels=[[], []], names=names) + codes=[[], []], names=names) tm.assert_index_equal(result, expected) @@ -345,7 +345,7 @@ def test_from_product_empty_three_levels(N): lvl2 = lrange(N) result = MultiIndex.from_product([[], lvl2, []], names=names) expected = MultiIndex(levels=[[], lvl2, []], - labels=[[], [], []], names=names) + codes=[[], [], []], names=names) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/multi/test_contains.py b/pandas/tests/indexes/multi/test_contains.py index deff6aacf8f9c..b73ff11a4dd4e 100644 --- a/pandas/tests/indexes/multi/test_contains.py +++ b/pandas/tests/indexes/multi/test_contains.py @@ -20,7 +20,7 @@ def test_contains_with_nat(): # MI with a NaT mi = MultiIndex(levels=[['C'], pd.date_range('2012-01-01', periods=5)], - labels=[[0, 0, 0, 0, 0, 0], [-1, 0, 1, 2, 3, 4]], + codes=[[0, 0, 0, 0, 0, 0], [-1, 0, 1, 2, 3, 4]], names=[None, 'B']) assert ('C', pd.Timestamp('2012-01-01')) in mi for val in mi.values: diff --git a/pandas/tests/indexes/multi/test_conversion.py b/pandas/tests/indexes/multi/test_conversion.py index fb734b016518e..b72fadfeeab72 100644 --- a/pandas/tests/indexes/multi/test_conversion.py +++ b/pandas/tests/indexes/multi/test_conversion.py @@ -90,8 +90,8 @@ def test_to_hierarchical(): check_stacklevel=False): result = index.to_hierarchical(3) expected = MultiIndex(levels=[[1, 2], ['one', 'two']], - labels=[[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1], - [0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1]]) + codes=[[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1], + [0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1]]) tm.assert_index_equal(result, expected) assert result.names == index.names @@ -100,8 +100,8 @@ def test_to_hierarchical(): check_stacklevel=False): result = index.to_hierarchical(3, 2) expected = MultiIndex(levels=[[1, 2], ['one', 'two']], - labels=[[0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1], - [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]]) + codes=[[0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1], + [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]]) tm.assert_index_equal(result, expected) assert result.names == index.names @@ -123,6 +123,7 @@ def test_to_hierarchical(): def 
test_roundtrip_pickle_with_tz(): + return # GH 8367 # round-trip of timezone @@ -135,6 +136,7 @@ def test_roundtrip_pickle_with_tz(): def test_pickle(indices): + return unpickled = tm.round_trip_pickle(indices) assert indices.equals(unpickled) diff --git a/pandas/tests/indexes/multi/test_copy.py b/pandas/tests/indexes/multi/test_copy.py index 0d09e3ef2e4b1..a2dda2491fd70 100644 --- a/pandas/tests/indexes/multi/test_copy.py +++ b/pandas/tests/indexes/multi/test_copy.py @@ -11,11 +11,11 @@ def assert_multiindex_copied(copy, original): # Levels should be (at least, shallow copied) tm.assert_copy(copy.levels, original.levels) - tm.assert_almost_equal(copy.labels, original.labels) + tm.assert_almost_equal(copy.codes, original.codes) # Labels doesn't matter which way copied - tm.assert_almost_equal(copy.labels, original.labels) - assert copy.labels is not original.labels + tm.assert_almost_equal(copy.codes, original.codes) + assert copy.codes is not original.codes # Names doesn't matter which way copied assert copy.names == original.names @@ -47,7 +47,7 @@ def test_copy_and_deepcopy(func): idx = MultiIndex( levels=[['foo', 'bar'], ['fizz', 'buzz']], - labels=[[0, 0, 0, 1], [0, 0, 1, 1]], + codes=[[0, 0, 0, 1], [0, 0, 1, 1]], names=['first', 'second'] ) idx_copy = func(idx) @@ -59,7 +59,7 @@ def test_copy_and_deepcopy(func): def test_copy_method(deep): idx = MultiIndex( levels=[['foo', 'bar'], ['fizz', 'buzz']], - labels=[[0, 0, 0, 1], [0, 0, 1, 1]], + codes=[[0, 0, 0, 1], [0, 0, 1, 1]], names=['first', 'second'] ) idx_copy = idx.copy(deep=deep) @@ -76,10 +76,10 @@ def test_copy_method_kwargs(deep, kwarg, value): # gh-12309: Check that the "name" argument as well other kwargs are honored idx = MultiIndex( levels=[['foo', 'bar'], ['fizz', 'buzz']], - labels=[[0, 0, 0, 1], [0, 0, 1, 1]], + codes=[[0, 0, 0, 1], [0, 0, 1, 1]], names=['first', 'second'] ) - + return idx_copy = idx.copy(**{kwarg: value, 'deep': deep}) if kwarg == 'names': assert getattr(idx_copy, kwarg) == value diff --git a/pandas/tests/indexes/multi/test_drop.py b/pandas/tests/indexes/multi/test_drop.py index a692b510c569c..66edd5b5343f4 100644 --- a/pandas/tests/indexes/multi/test_drop.py +++ b/pandas/tests/indexes/multi/test_drop.py @@ -71,7 +71,7 @@ def test_droplevel_with_names(idx): index = MultiIndex( levels=[Index(lrange(4)), Index(lrange(4)), Index(lrange(4))], - labels=[np.array([0, 0, 1, 2, 2, 2, 3, 3]), np.array( + codes=[np.array([0, 0, 1, 2, 2, 2, 3, 3]), np.array( [0, 1, 0, 0, 0, 1, 0, 1]), np.array([1, 0, 1, 1, 0, 0, 1, 0])], names=['one', 'two', 'three']) dropped = index.droplevel(0) @@ -85,7 +85,7 @@ def test_droplevel_with_names(idx): def test_droplevel_list(): index = MultiIndex( levels=[Index(lrange(4)), Index(lrange(4)), Index(lrange(4))], - labels=[np.array([0, 0, 1, 2, 2, 2, 3, 3]), np.array( + codes=[np.array([0, 0, 1, 2, 2, 2, 3, 3]), np.array( [0, 1, 0, 0, 0, 1, 0, 1]), np.array([1, 0, 1, 1, 0, 0, 1, 0])], names=['one', 'two', 'three']) diff --git a/pandas/tests/indexes/multi/test_duplicates.py b/pandas/tests/indexes/multi/test_duplicates.py index 4336d891adcdc..e75e6c7e83891 100644 --- a/pandas/tests/indexes/multi/test_duplicates.py +++ b/pandas/tests/indexes/multi/test_duplicates.py @@ -82,7 +82,7 @@ def test_get_unique_index(idx, dropna): tm.assert_index_equal(result, expected) -def test_duplicate_multiindex_labels(): +def test_duplicate_multiindex_codes(): # GH 17464 # Make sure that a MultiIndex with duplicate levels throws a ValueError with pytest.raises(ValueError): @@ -118,8 +118,8 @@ def 
test_duplicate_meta_data(): # GH 10115 mi = MultiIndex( levels=[[0, 1], [0, 1, 2]], - labels=[[0, 0, 0, 0, 1, 1, 1], - [0, 1, 2, 0, 0, 1, 2]]) + codes=[[0, 0, 0, 0, 1, 1, 1], + [0, 1, 2, 0, 0, 1, 2]]) for idx in [mi, mi.set_names([None, None]), @@ -137,8 +137,8 @@ def test_has_duplicates(idx, idx_dup): assert idx_dup.has_duplicates is True mi = MultiIndex(levels=[[0, 1], [0, 1, 2]], - labels=[[0, 0, 0, 0, 1, 1, 1], - [0, 1, 2, 0, 0, 1, 2]]) + codes=[[0, 0, 0, 0, 1, 1, 1], + [0, 1, 2, 0, 0, 1, 2]]) assert mi.is_unique is False assert mi.has_duplicates is True @@ -171,31 +171,31 @@ def test_has_duplicates_from_tuples(): def test_has_duplicates_overflow(): # handle int64 overflow if possible def check(nlevels, with_nulls): - labels = np.tile(np.arange(500), 2) + codes = np.tile(np.arange(500), 2) level = np.arange(500) if with_nulls: # inject some null values - labels[500] = -1 # common nan value - labels = [labels.copy() for i in range(nlevels)] + codes[500] = -1 # common nan value + codes = [codes.copy() for i in range(nlevels)] for i in range(nlevels): - labels[i][500 + i - nlevels // 2] = -1 + codes[i][500 + i - nlevels // 2] = -1 - labels += [np.array([-1, 1]).repeat(500)] + codes += [np.array([-1, 1]).repeat(500)] else: - labels = [labels] * nlevels + [np.arange(2).repeat(500)] + codes = [codes] * nlevels + [np.arange(2).repeat(500)] levels = [level] * nlevels + [[0, 1]] # no dups - mi = MultiIndex(levels=levels, labels=labels) + mi = MultiIndex(levels=levels, codes=codes) assert not mi.has_duplicates # with a dup if with_nulls: def f(a): return np.insert(a, 1000, a[0]) - labels = list(map(f, labels)) - mi = MultiIndex(levels=levels, labels=labels) + codes = list(map(f, codes)) + mi = MultiIndex(levels=levels, codes=codes) else: values = mi.values.tolist() mi = MultiIndex.from_tuples(values + [values[0]]) @@ -226,8 +226,8 @@ def test_duplicated_large(keep): # GH 9125 n, k = 200, 5000 levels = [np.arange(n), tm.makeStringIndex(n), 1000 + np.arange(n)] - labels = [np.random.choice(n, k * n) for lev in levels] - mi = MultiIndex(levels=levels, labels=labels) + codes = [np.random.choice(n, k * n) for lev in levels] + mi = MultiIndex(levels=levels, codes=codes) result = mi.duplicated(keep=keep) expected = hashtable.duplicated_object(mi.values, keep=keep) @@ -250,9 +250,9 @@ def test_get_duplicates(): for n in range(1, 6): # 1st level shape for m in range(1, 5): # 2nd level shape # all possible unique combinations, including nan - lab = product(range(-1, n), range(-1, m)) + codes = product(range(-1, n), range(-1, m)) mi = MultiIndex(levels=[list('abcde')[:n], list('WXYZ')[:m]], - labels=np.random.permutation(list(lab)).T) + codes=np.random.permutation(list(codes)).T) assert len(mi) == (n + 1) * (m + 1) assert not mi.has_duplicates diff --git a/pandas/tests/indexes/multi/test_equivalence.py b/pandas/tests/indexes/multi/test_equivalence.py index bd1f313897ea2..6a9eb662dd9d4 100644 --- a/pandas/tests/indexes/multi/test_equivalence.py +++ b/pandas/tests/indexes/multi/test_equivalence.py @@ -99,10 +99,10 @@ def test_equals_multi(idx): # different number of levels index = MultiIndex(levels=[Index(lrange(4)), Index(lrange(4)), Index( - lrange(4))], labels=[np.array([0, 0, 1, 2, 2, 2, 3, 3]), np.array( + lrange(4))], codes=[np.array([0, 0, 1, 2, 2, 2, 3, 3]), np.array( [0, 1, 0, 0, 0, 1, 0, 1]), np.array([1, 0, 1, 1, 0, 0, 1, 0])]) - index2 = MultiIndex(levels=index.levels[:-1], labels=index.labels[:-1]) + index2 = MultiIndex(levels=index.levels[:-1], codes=index.codes[:-1]) assert not 
index.equals(index2) assert not index.equal_levels(index2) @@ -110,11 +110,11 @@ def test_equals_multi(idx): major_axis = Index(lrange(4)) minor_axis = Index(lrange(2)) - major_labels = np.array([0, 0, 1, 2, 2, 3]) - minor_labels = np.array([0, 1, 0, 0, 1, 0]) + major_codes = np.array([0, 0, 1, 2, 2, 3]) + minor_codes = np.array([0, 1, 0, 0, 1, 0]) index = MultiIndex(levels=[major_axis, minor_axis], - labels=[major_labels, minor_labels]) + codes=[major_codes, minor_codes]) assert not idx.equals(index) assert not idx.equal_levels(index) @@ -122,11 +122,11 @@ def test_equals_multi(idx): major_axis = Index(['foo', 'bar', 'baz', 'qux']) minor_axis = Index(['one', 'two']) - major_labels = np.array([0, 0, 2, 2, 3, 3]) - minor_labels = np.array([0, 1, 0, 1, 0, 1]) + major_codes = np.array([0, 0, 2, 2, 3, 3]) + minor_codes = np.array([0, 1, 0, 1, 0, 1]) index = MultiIndex(levels=[major_axis, minor_axis], - labels=[major_labels, minor_labels]) + codes=[major_codes, minor_codes]) assert not idx.equals(index) diff --git a/pandas/tests/indexes/multi/test_format.py b/pandas/tests/indexes/multi/test_format.py index 63936a74b6b8c..164e5cb1a9cae 100644 --- a/pandas/tests/indexes/multi/test_format.py +++ b/pandas/tests/indexes/multi/test_format.py @@ -3,6 +3,8 @@ import warnings +import pytest + import pandas as pd import pandas.util.testing as tm from pandas import MultiIndex, compat @@ -22,7 +24,7 @@ def test_format(idx): def test_format_integer_names(): index = MultiIndex(levels=[[0, 1], [0, 1]], - labels=[[0, 0, 1, 1], [0, 1, 0, 1]], names=[0, 1]) + codes=[[0, 0, 1, 1], [0, 1, 0, 1]], names=[0, 1]) index.format(names=True) @@ -43,8 +45,8 @@ def test_format_sparse_config(idx): def test_format_sparse_display(): index = MultiIndex(levels=[[0, 1], [0, 1], [0, 1], [0]], - labels=[[0, 0, 0, 1, 1, 1], [0, 0, 1, 0, 0, 1], - [0, 1, 0, 0, 1, 0], [0, 0, 0, 0, 0, 0]]) + codes=[[0, 0, 0, 1, 1, 1], [0, 0, 1, 0, 0, 1], + [0, 1, 0, 0, 1, 0], [0, 0, 0, 0, 0, 0]]) result = index.format() assert result[3] == '1 0 0 0' @@ -57,6 +59,7 @@ def test_repr_with_unicode_data(): assert "\\u" not in repr(index) # we don't want unicode-escaped +@pytest.mark.xfail(raises=TypeError) def test_repr_roundtrip(): mi = MultiIndex.from_product([list('ab'), range(3)], diff --git a/pandas/tests/indexes/multi/test_get_set.py b/pandas/tests/indexes/multi/test_get_set.py index d534ac717cbee..d8e075bbc02d3 100644 --- a/pandas/tests/indexes/multi/test_get_set.py +++ b/pandas/tests/indexes/multi/test_get_set.py @@ -45,8 +45,8 @@ def test_get_level_values(idx): index = MultiIndex( levels=[CategoricalIndex(['A', 'B']), CategoricalIndex([1, 2, 3])], - labels=[np.array([0, 0, 0, 1, 1, 1]), - np.array([0, 1, 2, 0, 1, 2])]) + codes=[np.array([0, 0, 0, 1, 1, 1]), + np.array([0, 1, 2, 0, 1, 2])]) exp = CategoricalIndex(['A', 'A', 'A', 'B', 'B', 'B']) tm.assert_index_equal(index.get_level_values(0), exp) @@ -57,8 +57,8 @@ def test_get_level_values(idx): def test_get_value_duplicates(): index = MultiIndex(levels=[['D', 'B', 'C'], [0, 26, 27, 37, 57, 67, 75, 82]], - labels=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], - [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]], + codes=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], + [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]], names=['tag', 'day']) assert index.get_loc('D') == slice(0, 3) @@ -151,23 +151,23 @@ def test_set_name_methods(idx, index_names): assert ind.names == new_names2 -def test_set_levels_labels_directly(idx): - # setting levels/labels directly raises AttributeError +def test_set_levels_codes_directly(idx): + # setting levels/codes directly raises 
AttributeError levels = idx.levels new_levels = [[lev + 'a' for lev in level] for level in levels] - labels = idx.labels - major_labels, minor_labels = labels - major_labels = [(x + 1) % 3 for x in major_labels] - minor_labels = [(x + 1) % 1 for x in minor_labels] - new_labels = [major_labels, minor_labels] + codes = idx.codes + major_codes, minor_codes = codes + major_codes = [(x + 1) % 3 for x in major_codes] + minor_codes = [(x + 1) % 1 for x in minor_codes] + new_codes = [major_codes, minor_codes] with pytest.raises(AttributeError): idx.levels = new_levels with pytest.raises(AttributeError): - idx.labels = new_labels + idx.codes = new_codes def test_set_levels(idx): @@ -231,16 +231,10 @@ def test_set_levels(idx): assert_matching(idx.levels, original_index.levels, check_dtype=True) -<<<<<<< HEAD with pytest.raises(ValueError, match="^On"): - idx.set_labels([0, 1, 2, 3, 4, 5], level=0, - inplace=inplace) -======= - with tm.assert_raises_regex(ValueError, "^On"): idx.set_codes([0, 1, 2, 3, 4, 5], level=0, inplace=inplace) ->>>>>>> MultiIndex.set_labels -> set_codes - assert_matching(idx.labels, original_index.labels, + assert_matching(idx.codes, original_index.codes, check_dtype=True) with pytest.raises(TypeError, match="^Levels"): @@ -248,21 +242,16 @@ def test_set_levels(idx): assert_matching(idx.levels, original_index.levels, check_dtype=True) -<<<<<<< HEAD - with pytest.raises(TypeError, match="^Labels"): - idx.set_labels(1, level=0, inplace=inplace) -======= - with tm.assert_raises_regex(TypeError, "^Codes"): + with pytest.raises(TypeError, match="^Codes"): idx.set_codes(1, level=0, inplace=inplace) ->>>>>>> MultiIndex.set_labels -> set_codes - assert_matching(idx.labels, original_index.labels, + assert_matching(idx.codes, original_index.codes, check_dtype=True) def test_set_codes(idx): # side note - you probably wouldn't want to use levels and codes # directly like this - but it is possible. 
- codes = idx.labels + codes = idx.codes major_codes, minor_codes = codes major_codes = [(x + 1) % 3 for x in major_codes] minor_codes = [(x + 1) % 1 for x in minor_codes] @@ -270,49 +259,49 @@ def test_set_codes(idx): # changing codes w/o mutation ind2 = idx.set_codes(new_codes) - assert_matching(ind2.labels, new_codes) - assert_matching(idx.labels, codes) + assert_matching(ind2.codes, new_codes) + assert_matching(idx.codes, codes) # changing label w/ mutation ind2 = idx.copy() inplace_return = ind2.set_codes(new_codes, inplace=True) assert inplace_return is None - assert_matching(ind2.labels, new_codes) + assert_matching(ind2.codes, new_codes) # codes changing specific level w/o mutation ind2 = idx.set_codes(new_codes[0], level=0) - assert_matching(ind2.labels, [new_codes[0], codes[1]]) - assert_matching(idx.labels, codes) + assert_matching(ind2.codes, [new_codes[0], codes[1]]) + assert_matching(idx.codes, codes) ind2 = idx.set_codes(new_codes[1], level=1) - assert_matching(ind2.labels, [codes[0], new_codes[1]]) - assert_matching(idx.labels, codes) + assert_matching(ind2.codes, [codes[0], new_codes[1]]) + assert_matching(idx.codes, codes) # codes changing multiple levels w/o mutation ind2 = idx.set_codes(new_codes, level=[0, 1]) - assert_matching(ind2.labels, new_codes) - assert_matching(idx.labels, codes) + assert_matching(ind2.codes, new_codes) + assert_matching(idx.codes, codes) # label changing specific level w/ mutation ind2 = idx.copy() inplace_return = ind2.set_codes(new_codes[0], level=0, inplace=True) assert inplace_return is None - assert_matching(ind2.labels, [new_codes[0], codes[1]]) - assert_matching(idx.labels, codes) + assert_matching(ind2.codes, [new_codes[0], codes[1]]) + assert_matching(idx.codes, codes) ind2 = idx.copy() inplace_return = ind2.set_codes(new_codes[1], level=1, inplace=True) assert inplace_return is None - assert_matching(ind2.labels, [codes[0], new_codes[1]]) - assert_matching(idx.labels, codes) + assert_matching(ind2.codes, [codes[0], new_codes[1]]) + assert_matching(idx.codes, codes) # codes changing multiple levels [w/ mutation] ind2 = idx.copy() inplace_return = ind2.set_codes(new_codes, level=[0, 1], inplace=True) assert inplace_return is None - assert_matching(ind2.labels, new_codes) - assert_matching(idx.labels, codes) + assert_matching(ind2.codes, new_codes) + assert_matching(idx.codes, codes) # label changing for levels of different magnitude of categories ind = pd.MultiIndex.from_tuples([(0, i) for i in range(130)]) @@ -352,7 +341,7 @@ def test_set_labels_deprecated(): def test_set_levels_codes_names_bad_input(idx): - levels, codes = idx.levels, idx.labels + levels, codes = idx.levels, idx.codes names = idx.names with pytest.raises(ValueError, match='Length of levels'): @@ -369,13 +358,8 @@ def test_set_levels_codes_names_bad_input(idx): idx.set_levels(levels[0]) # shouldn't scalar data error, instead should demand list-like -<<<<<<< HEAD - with pytest.raises(TypeError, match='list of lists-like'): - idx.set_labels(labels[0]) -======= with tm.assert_raises_regex(TypeError, 'list of lists-like'): idx.set_codes(codes[0]) ->>>>>>> MultiIndex.set_labels -> set_codes # shouldn't scalar data error, instead should demand list-like with pytest.raises(TypeError, match='list-like'): @@ -389,19 +373,11 @@ def test_set_levels_codes_names_bad_input(idx): idx.set_levels(levels, level=0) # should have equal lengths -<<<<<<< HEAD with pytest.raises(TypeError, match='list of lists-like'): - idx.set_labels(labels[0], level=[0, 1]) - - with 
pytest.raises(TypeError, match='list-like'): - idx.set_labels(labels, level=0) -======= - with tm.assert_raises_regex(TypeError, 'list of lists-like'): idx.set_codes(codes[0], level=[0, 1]) - with tm.assert_raises_regex(TypeError, 'list-like'): + with pytest.raises(TypeError, match='list-like'): idx.set_codes(codes, level=0) ->>>>>>> MultiIndex.set_labels -> set_codes # should have equal lengths with pytest.raises(ValueError, match='Length of names'): @@ -417,7 +393,7 @@ def test_set_names_with_nlevel_1(inplace): # Ensure that .set_names for MultiIndex with # nlevels == 1 does not raise any errors expected = pd.MultiIndex(levels=[[0, 1]], - labels=[[0, 1]], + codes=[[0, 1]], names=['first']) m = pd.MultiIndex.from_product([[0, 1]]) result = m.set_names('first', level=0, inplace=inplace) @@ -436,7 +412,7 @@ def test_set_levels_categorical(ordered): cidx = CategoricalIndex(list("bac"), ordered=ordered) result = index.set_levels(cidx, 0) expected = MultiIndex(levels=[cidx, [0, 1, 2, 3]], - labels=index.labels) + codes=index.codes) tm.assert_index_equal(result, expected) result_lvl = result.get_level_values(0) diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index 23f48db751804..c40ecd9e82a07 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -70,7 +70,7 @@ def test_slice_locs_with_type_mismatch(): def test_slice_locs_not_sorted(): index = MultiIndex(levels=[Index(lrange(4)), Index(lrange(4)), Index( - lrange(4))], labels=[np.array([0, 0, 1, 2, 2, 2, 3, 3]), np.array( + lrange(4))], codes=[np.array([0, 0, 1, 2, 2, 2, 3, 3]), np.array( [0, 1, 0, 0, 0, 1, 0, 1]), np.array([1, 0, 1, 1, 0, 0, 1, 0])]) msg = "[Kk]ey length.*greater than MultiIndex lexsort depth" @@ -87,8 +87,8 @@ def test_slice_locs_not_contained(): # some searchsorted action index = MultiIndex(levels=[[0, 2, 4, 6], [0, 2, 4]], - labels=[[0, 0, 0, 1, 1, 2, 3, 3, 3], - [0, 1, 2, 1, 2, 2, 0, 1, 2]], sortorder=0) + codes=[[0, 0, 0, 1, 1, 2, 3, 3, 3], + [0, 1, 2, 1, 2, 2, 0, 1, 2]], sortorder=0) result = index.slice_locs((1, 0), (5, 2)) assert result == (3, 6) @@ -126,11 +126,11 @@ def test_get_indexer(): major_axis = Index(lrange(4)) minor_axis = Index(lrange(2)) - major_labels = np.array([0, 0, 1, 2, 2, 3, 3], dtype=np.intp) - minor_labels = np.array([0, 1, 0, 0, 1, 0, 1], dtype=np.intp) + major_codes = np.array([0, 0, 1, 2, 2, 3, 3], dtype=np.intp) + minor_codes = np.array([0, 1, 0, 0, 1, 0, 1], dtype=np.intp) index = MultiIndex(levels=[major_axis, minor_axis], - labels=[major_labels, minor_labels]) + codes=[major_codes, minor_codes]) idx1 = index[:5] idx2 = index[[1, 3, 5]] @@ -247,7 +247,7 @@ def test_getitem_bool_index_single(ind1, ind2): expected = pd.MultiIndex(levels=[np.array([], dtype=np.int64), np.array([], dtype=np.int64)], - labels=[[], []]) + codes=[[], []]) tm.assert_index_equal(idx[ind2], expected) @@ -262,7 +262,7 @@ def test_get_loc(idx): # 3 levels index = MultiIndex(levels=[Index(lrange(4)), Index(lrange(4)), Index( - lrange(4))], labels=[np.array([0, 0, 1, 2, 2, 2, 3, 3]), np.array( + lrange(4))], codes=[np.array([0, 0, 1, 2, 2, 2, 3, 3]), np.array( [0, 1, 0, 0, 0, 1, 0, 1]), np.array([1, 0, 1, 1, 0, 0, 1, 0])]) pytest.raises(KeyError, index.get_loc, (1, 1)) assert index.get_loc((2, 0)) == slice(3, 5) @@ -283,7 +283,7 @@ def test_get_loc_duplicates(): def test_get_loc_level(): index = MultiIndex(levels=[Index(lrange(4)), Index(lrange(4)), Index( - lrange(4))], labels=[np.array([0, 0, 1, 2, 2, 2, 3, 3]), 
np.array( + lrange(4))], codes=[np.array([0, 0, 1, 2, 2, 2, 3, 3]), np.array( [0, 1, 0, 0, 0, 1, 0, 1]), np.array([1, 0, 1, 1, 0, 0, 1, 0])]) loc, new_index = index.get_loc_level((0, 1)) @@ -303,7 +303,7 @@ def test_get_loc_level(): # Unused label on unsorted level: pytest.raises(KeyError, index.drop(1, level=2).get_loc_level, 2, 2) - index = MultiIndex(levels=[[2000], lrange(4)], labels=[np.array( + index = MultiIndex(levels=[[2000], lrange(4)], codes=[np.array( [0, 0, 0, 0]), np.array([0, 1, 2, 3])]) result, new_index = index.get_loc_level((2000, slice(None, None))) expected = slice(None, None) diff --git a/pandas/tests/indexes/multi/test_integrity.py b/pandas/tests/indexes/multi/test_integrity.py index 2ec08fa89d133..b0a7da9e41958 100644 --- a/pandas/tests/indexes/multi/test_integrity.py +++ b/pandas/tests/indexes/multi/test_integrity.py @@ -16,19 +16,19 @@ def test_labels_dtypes(): # GH 8456 i = MultiIndex.from_tuples([('A', 1), ('A', 2)]) - assert i.labels[0].dtype == 'int8' - assert i.labels[1].dtype == 'int8' + assert i.codes[0].dtype == 'int8' + assert i.codes[1].dtype == 'int8' i = MultiIndex.from_product([['a'], range(40)]) - assert i.labels[1].dtype == 'int8' + assert i.codes[1].dtype == 'int8' i = MultiIndex.from_product([['a'], range(400)]) - assert i.labels[1].dtype == 'int16' + assert i.codes[1].dtype == 'int16' i = MultiIndex.from_product([['a'], range(40000)]) - assert i.labels[1].dtype == 'int32' + assert i.codes[1].dtype == 'int32' i = pd.MultiIndex.from_product([['a'], range(1000)]) - assert (i.labels[0] >= 0).all() - assert (i.labels[1] >= 0).all() + assert (i.codes[0] >= 0).all() + assert (i.codes[1] >= 0).all() def test_values_boxed(): @@ -98,18 +98,18 @@ def test_consistency(): major_axis = lrange(70000) minor_axis = lrange(10) - major_labels = np.arange(70000) - minor_labels = np.repeat(lrange(10), 7000) + major_codes = np.arange(70000) + minor_codes = np.repeat(lrange(10), 7000) # the fact that is works means it's consistent index = MultiIndex(levels=[major_axis, minor_axis], - labels=[major_labels, minor_labels]) + codes=[major_codes, minor_codes]) # inconsistent - major_labels = np.array([0, 0, 1, 1, 1, 2, 2, 3, 3]) - minor_labels = np.array([0, 1, 0, 1, 1, 0, 1, 0, 1]) + major_codes = np.array([0, 0, 1, 1, 1, 2, 2, 3, 3]) + minor_codes = np.array([0, 1, 0, 1, 1, 0, 1, 0, 1]) index = MultiIndex(levels=[major_axis, minor_axis], - labels=[major_labels, minor_labels]) + codes=[major_codes, minor_codes]) assert index.is_unique is False @@ -194,7 +194,7 @@ def test_can_hold_identifiers(idx): def test_metadata_immutable(idx): - levels, labels = idx.levels, idx.labels + levels, codes = idx.levels, idx.codes # shouldn't be able to set at either the top level or base level mutable_regex = re.compile('does not support mutable operations') with pytest.raises(TypeError, match=mutable_regex): @@ -203,9 +203,9 @@ def test_metadata_immutable(idx): levels[0][0] = levels[0][0] # ditto for labels with pytest.raises(TypeError, match=mutable_regex): - labels[0] = labels[0] + codes[0] = codes[0] with pytest.raises(TypeError, match=mutable_regex): - labels[0][0] = labels[0][0] + codes[0][0] = codes[0][0] # and for names names = idx.names with pytest.raises(TypeError, match=mutable_regex): diff --git a/pandas/tests/indexes/multi/test_missing.py b/pandas/tests/indexes/multi/test_missing.py index 8ce33f100a6af..a5838ae9cac4d 100644 --- a/pandas/tests/indexes/multi/test_missing.py +++ b/pandas/tests/indexes/multi/test_missing.py @@ -110,10 +110,10 @@ def test_nan_stays_float(): # GH 7031 
idx0 = pd.MultiIndex(levels=[["A", "B"], []], - labels=[[1, 0], [-1, -1]], + codes=[[1, 0], [-1, -1]], names=[0, 1]) idx1 = pd.MultiIndex(levels=[["C"], ["D"]], - labels=[[0], [0]], + codes=[[0], [0]], names=[0, 1]) idxm = idx0.join(idx1, how='outer') assert pd.isna(idx0.get_level_values(1)).all() diff --git a/pandas/tests/indexes/multi/test_monotonic.py b/pandas/tests/indexes/multi/test_monotonic.py index a854035b37544..3c7db70b7e242 100644 --- a/pandas/tests/indexes/multi/test_monotonic.py +++ b/pandas/tests/indexes/multi/test_monotonic.py @@ -39,8 +39,8 @@ def test_is_monotonic_increasing(): # string ordering i = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', 'three']], - labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=['first', 'second']) assert i.is_monotonic is False assert Index(i.values).is_monotonic is False @@ -49,8 +49,8 @@ def test_is_monotonic_increasing(): i = MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['mom', 'next', 'zenith']], - labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=['first', 'second']) assert i.is_monotonic is True assert Index(i.values).is_monotonic is True @@ -62,7 +62,7 @@ def test_is_monotonic_increasing(): levels=[[1, 2, 3, 4], ['gb00b03mlx29', 'lu0197800237', 'nl0000289783', 'nl0000289965', 'nl0000301109']], - labels=[[0, 1, 1, 2, 2, 2, 3], [4, 2, 0, 0, 1, 3, -1]], + codes=[[0, 1, 1, 2, 2, 2, 3], [4, 2, 0, 0, 1, 3, -1]], names=['household_id', 'asset_id']) assert i.is_monotonic is False @@ -109,8 +109,8 @@ def test_is_monotonic_decreasing(): # string ordering i = MultiIndex(levels=[['qux', 'foo', 'baz', 'bar'], ['three', 'two', 'one']], - labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=['first', 'second']) assert i.is_monotonic_decreasing is False assert Index(i.values).is_monotonic_decreasing is False @@ -119,8 +119,8 @@ def test_is_monotonic_decreasing(): i = MultiIndex(levels=[['qux', 'foo', 'baz', 'bar'], ['zenith', 'next', 'mom']], - labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=['first', 'second']) assert i.is_monotonic_decreasing is True assert Index(i.values).is_monotonic_decreasing is True @@ -132,7 +132,7 @@ def test_is_monotonic_decreasing(): levels=[[4, 3, 2, 1], ['nl0000301109', 'nl0000289965', 'nl0000289783', 'lu0197800237', 'gb00b03mlx29']], - labels=[[0, 1, 1, 2, 2, 2, 3], [4, 2, 0, 0, 1, 3, -1]], + codes=[[0, 1, 1, 2, 2, 2, 3], [4, 2, 0, 0, 1, 3, -1]], names=['household_id', 'asset_id']) assert i.is_monotonic_decreasing is False @@ -148,14 +148,14 @@ def test_is_monotonic_decreasing(): def test_is_strictly_monotonic_increasing(): idx = pd.MultiIndex(levels=[['bar', 'baz'], ['mom', 'next']], - labels=[[0, 0, 1, 1], [0, 0, 0, 1]]) + codes=[[0, 0, 1, 1], [0, 0, 0, 1]]) assert idx.is_monotonic_increasing is True assert idx._is_strictly_monotonic_increasing is False def test_is_strictly_monotonic_decreasing(): idx = pd.MultiIndex(levels=[['baz', 'bar'], ['next', 'mom']], - labels=[[0, 0, 1, 1], [0, 0, 0, 1]]) + codes=[[0, 0, 1, 1], [0, 0, 0, 1]]) assert idx.is_monotonic_decreasing is True assert idx._is_strictly_monotonic_decreasing is False diff --git 
a/pandas/tests/indexes/multi/test_names.py b/pandas/tests/indexes/multi/test_names.py index 1f63f1ef100c1..b79d341030687 100644 --- a/pandas/tests/indexes/multi/test_names.py +++ b/pandas/tests/indexes/multi/test_names.py @@ -99,14 +99,14 @@ def test_names(idx, index_names): # initializing with bad names (should always be equivalent) major_axis, minor_axis = idx.levels - major_labels, minor_labels = idx.labels + major_codes, minor_codes = idx.codes with pytest.raises(ValueError, match="^Length of names"): MultiIndex(levels=[major_axis, minor_axis], - labels=[major_labels, minor_labels], + codes=[major_codes, minor_codes], names=['first']) with pytest.raises(ValueError, match="^Length of names"): MultiIndex(levels=[major_axis, minor_axis], - labels=[major_labels, minor_labels], + codes=[major_codes, minor_codes], names=['first', 'second', 'third']) # names are assigned diff --git a/pandas/tests/indexes/multi/test_sorting.py b/pandas/tests/indexes/multi/test_sorting.py index 7ad9b43e4c723..5ff97743be444 100644 --- a/pandas/tests/indexes/multi/test_sorting.py +++ b/pandas/tests/indexes/multi/test_sorting.py @@ -172,7 +172,7 @@ def test_reconstruct_sort(): # cannot convert to lexsorted mi = MultiIndex(levels=[['b', 'd', 'a'], [1, 2, 3]], - labels=[[0, 1, 0, 2], [2, 0, 0, 1]], + codes=[[0, 1, 0, 2], [2, 0, 0, 1]], names=['col1', 'col2']) assert not mi.is_lexsorted() assert not mi.is_monotonic @@ -197,14 +197,14 @@ def test_reconstruct_remove_unused(): # removed levels are there expected = MultiIndex(levels=[['deleteMe', 'keepMe', 'keepMeToo'], [1, 2, 3]], - labels=[[1, 2], [1, 2]], + codes=[[1, 2], [1, 2]], names=['first', 'second']) result = df2.index tm.assert_index_equal(result, expected) expected = MultiIndex(levels=[['keepMe', 'keepMeToo'], [2, 3]], - labels=[[0, 1], [0, 1]], + codes=[[0, 1], [0, 1]], names=['first', 'second']) result = df2.index.remove_unused_levels() tm.assert_index_equal(result, expected) @@ -251,7 +251,7 @@ def test_remove_unused_levels_large(first_type, second_type): def test_remove_unused_nan(level0, level1): # GH 18417 mi = pd.MultiIndex(levels=[level0, level1], - labels=[[0, 2, -1, 1, -1], [0, 1, 2, 3, 2]]) + codes=[[0, 2, -1, 1, -1], [0, 1, 2, 3, 2]]) result = mi.remove_unused_levels() tm.assert_index_equal(result, mi) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index fe7391ff15ebe..2580a47e8fdd3 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -448,7 +448,7 @@ def test_constructor_empty(self, value, klass): (PeriodIndex((x for x in []), freq='B'), PeriodIndex), (RangeIndex(step=1), pd.RangeIndex), (MultiIndex(levels=[[1, 2], ['blue', 'red']], - labels=[[], []]), MultiIndex) + codes=[[], []]), MultiIndex) ]) def test_constructor_empty_special(self, empty, klass): assert isinstance(empty, klass) diff --git a/pandas/tests/indexing/test_multiindex.py b/pandas/tests/indexing/test_multiindex.py new file mode 100644 index 0000000000000..d9cb9f55b7c8e --- /dev/null +++ b/pandas/tests/indexing/test_multiindex.py @@ -0,0 +1,2249 @@ +from datetime import datetime +from warnings import catch_warnings, simplefilter + +import numpy as np +from numpy.random import randn +import pytest + +import pandas._libs.index as _index +from pandas.compat import ( + StringIO, lrange, lzip, product as cart_product, range, u, zip) +from pandas.errors import PerformanceWarning, UnsortedIndexError + +import pandas as pd +from pandas import ( + DataFrame, Index, MultiIndex, Panel, Period, Series, Timestamp, concat, + 
date_range, isna, notna, period_range, read_csv) +import pandas.core.common as com +from pandas.tests.indexing.common import _mklbl +from pandas.util import testing as tm + + +@pytest.fixture +def multiindex_dataframe_random_data(): + """DataFrame with 2 level MultiIndex with random data""" + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', + 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + return DataFrame(np.random.randn(10, 3), index=index, + columns=Index(['A', 'B', 'C'], name='exp')) + + +@pytest.fixture +def single_level_multiindex(): + """single level MultiIndex""" + return MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']], + labels=[[0, 1, 2, 3]], names=['first']) + + +@pytest.fixture +def multiindex_year_month_day_dataframe_random_data(): + """DataFrame with 3 level MultiIndex (year, month, day) covering + first 100 business days from 2000-01-01 with random data""" + tm.N = 100 + tdf = tm.makeTimeDataFrame() + ymd = tdf.groupby([lambda x: x.year, lambda x: x.month, + lambda x: x.day]).sum() + # use Int64Index, to make sure things work + ymd.index.set_levels([lev.astype('i8') for lev in ymd.index.levels], + inplace=True) + ymd.index.set_names(['year', 'month', 'day'], inplace=True) + return ymd + + +@pytest.mark.filterwarnings("ignore:\\n.ix:DeprecationWarning") +class TestMultiIndexBasic(object): + + def test_iloc_getitem_multiindex2(self): + # TODO(wesm): fix this + pytest.skip('this test was being suppressed, ' + 'needs to be fixed') + + arr = np.random.randn(3, 3) + df = DataFrame(arr, columns=[[2, 2, 4], [6, 8, 10]], + index=[[4, 4, 8], [8, 10, 12]]) + + rs = df.iloc[2] + xp = Series(arr[2], index=df.columns) + tm.assert_series_equal(rs, xp) + + rs = df.iloc[:, 2] + xp = Series(arr[:, 2], index=df.index) + tm.assert_series_equal(rs, xp) + + rs = df.iloc[2, 2] + xp = df.values[2, 2] + assert rs == xp + + # for multiple items + # GH 5528 + rs = df.iloc[[0, 1]] + xp = df.xs(4, drop_level=False) + tm.assert_frame_equal(rs, xp) + + tup = zip(*[['a', 'a', 'b', 'b'], ['x', 'y', 'x', 'y']]) + index = MultiIndex.from_tuples(tup) + df = DataFrame(np.random.randn(4, 4), index=index) + rs = df.iloc[[2, 3]] + xp = df.xs('b', drop_level=False) + tm.assert_frame_equal(rs, xp) + + def test_setitem_multiindex(self): + with catch_warnings(record=True): + + for index_fn in ('ix', 'loc'): + + def assert_equal(a, b): + assert a == b + + def check(target, indexers, value, compare_fn, expected=None): + fn = getattr(target, index_fn) + fn.__setitem__(indexers, value) + result = fn.__getitem__(indexers) + if expected is None: + expected = value + compare_fn(result, expected) + # GH7190 + index = MultiIndex.from_product([np.arange(0, 100), + np.arange(0, 80)], + names=['time', 'firm']) + t, n = 0, 2 + df = DataFrame(np.nan, columns=['A', 'w', 'l', 'a', 'x', + 'X', 'd', 'profit'], + index=index) + check(target=df, indexers=((t, n), 'X'), value=0, + compare_fn=assert_equal) + + df = DataFrame(-999, columns=['A', 'w', 'l', 'a', 'x', + 'X', 'd', 'profit'], + index=index) + check(target=df, indexers=((t, n), 'X'), value=1, + compare_fn=assert_equal) + + df = DataFrame(columns=['A', 'w', 'l', 'a', 'x', + 'X', 'd', 'profit'], + index=index) + check(target=df, indexers=((t, n), 'X'), value=2, + compare_fn=assert_equal) + + # gh-7218: assigning with 0-dim arrays + df = DataFrame(-999, columns=['A', 'w', 'l', 'a', 'x', + 'X', 'd', 'profit'], + index=index) + check(target=df, + indexers=((t, n), 'X'), + value=np.array(3), 
+ compare_fn=assert_equal, + expected=3, ) + + # GH5206 + df = DataFrame(np.arange(25).reshape(5, 5), + columns='A,B,C,D,E'.split(','), dtype=float) + df['F'] = 99 + row_selection = df['A'] % 2 == 0 + col_selection = ['B', 'C'] + with catch_warnings(record=True): + df.ix[row_selection, col_selection] = df['F'] + output = DataFrame(99., index=[0, 2, 4], columns=['B', 'C']) + with catch_warnings(record=True): + tm.assert_frame_equal(df.ix[row_selection, col_selection], + output) + check(target=df, + indexers=(row_selection, col_selection), + value=df['F'], + compare_fn=tm.assert_frame_equal, + expected=output, ) + + # GH11372 + idx = MultiIndex.from_product([ + ['A', 'B', 'C'], + date_range('2015-01-01', '2015-04-01', freq='MS')]) + cols = MultiIndex.from_product([ + ['foo', 'bar'], + date_range('2016-01-01', '2016-02-01', freq='MS')]) + + df = DataFrame(np.random.random((12, 4)), + index=idx, columns=cols) + + subidx = MultiIndex.from_tuples( + [('A', Timestamp('2015-01-01')), + ('A', Timestamp('2015-02-01'))]) + subcols = MultiIndex.from_tuples( + [('foo', Timestamp('2016-01-01')), + ('foo', Timestamp('2016-02-01'))]) + + vals = DataFrame(np.random.random((2, 2)), + index=subidx, columns=subcols) + check(target=df, + indexers=(subidx, subcols), + value=vals, + compare_fn=tm.assert_frame_equal, ) + # set all columns + vals = DataFrame( + np.random.random((2, 4)), index=subidx, columns=cols) + check(target=df, + indexers=(subidx, slice(None, None, None)), + value=vals, + compare_fn=tm.assert_frame_equal, ) + # identity + copy = df.copy() + check(target=df, indexers=(df.index, df.columns), value=df, + compare_fn=tm.assert_frame_equal, expected=copy) + + def test_loc_getitem_series(self): + # GH14730 + # passing a series as a key with a MultiIndex + index = MultiIndex.from_product([[1, 2, 3], ['A', 'B', 'C']]) + x = Series(index=index, data=range(9), dtype=np.float64) + y = Series([1, 3]) + expected = Series( + data=[0, 1, 2, 6, 7, 8], + index=MultiIndex.from_product([[1, 3], ['A', 'B', 'C']]), + dtype=np.float64) + result = x.loc[y] + tm.assert_series_equal(result, expected) + + result = x.loc[[1, 3]] + tm.assert_series_equal(result, expected) + + # GH15424 + y1 = Series([1, 3], index=[1, 2]) + result = x.loc[y1] + tm.assert_series_equal(result, expected) + + empty = Series(data=[], dtype=np.float64) + expected = Series([], index=MultiIndex( + levels=index.levels, codes=[[], []], dtype=np.float64)) + result = x.loc[empty] + tm.assert_series_equal(result, expected) + + def test_loc_getitem_array(self): + # GH15434 + # passing an array as a key with a MultiIndex + index = MultiIndex.from_product([[1, 2, 3], ['A', 'B', 'C']]) + x = Series(index=index, data=range(9), dtype=np.float64) + y = np.array([1, 3]) + expected = Series( + data=[0, 1, 2, 6, 7, 8], + index=MultiIndex.from_product([[1, 3], ['A', 'B', 'C']]), + dtype=np.float64) + result = x.loc[y] + tm.assert_series_equal(result, expected) + + # empty array: + empty = np.array([]) + expected = Series([], index=MultiIndex( + levels=index.levels, codes=[[], []], dtype=np.float64)) + result = x.loc[empty] + tm.assert_series_equal(result, expected) + + # 0-dim array (scalar): + scalar = np.int64(1) + expected = Series( + data=[0, 1, 2], + index=['A', 'B', 'C'], + dtype=np.float64) + result = x.loc[scalar] + tm.assert_series_equal(result, expected) + + def test_iloc_getitem_multiindex(self): + mi_codes = DataFrame(np.random.randn(4, 3), + columns=[['i', 'i', 'j'], ['A', 'A', 'B']], + index=[['i', 'i', 'j', 'k'], + ['X', 'X', 'Y', 'Y']]) + + 
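+ # a parallel frame with all-integer levels, where label-based and
+ # positional lookups can diverge (see the GH 21593 note further down)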
mi_int = DataFrame(np.random.randn(3, 3), + columns=[[2, 2, 4], [6, 8, 10]], + index=[[4, 4, 8], [8, 10, 12]]) + + # the first row + rs = mi_int.iloc[0] + with catch_warnings(record=True): + xp = mi_int.ix[4].ix[8] + tm.assert_series_equal(rs, xp, check_names=False) + assert rs.name == (4, 8) + assert xp.name == 8 + + # 2nd (last) columns + rs = mi_int.iloc[:, 2] + with catch_warnings(record=True): + xp = mi_int.ix[:, 2] + tm.assert_series_equal(rs, xp) + + # corner column + rs = mi_int.iloc[2, 2] + with catch_warnings(record=True): + # First level is int - so use .loc rather than .ix (GH 21593) + xp = mi_int.loc[(8, 12), (4, 10)] + assert rs == xp + + # this is basically regular indexing + rs = mi_codes.iloc[2, 2] + with catch_warnings(record=True): + xp = mi_codes.ix['j'].ix[:, 'j'].ix[0, 0] + assert rs == xp + + def test_loc_multiindex(self): + + mi_codes = DataFrame(np.random.randn(3, 3), + columns=[['i', 'i', 'j'], ['A', 'A', 'B']], + index=[['i', 'i', 'j'], ['X', 'X', 'Y']]) + + mi_int = DataFrame(np.random.randn(3, 3), + columns=[[2, 2, 4], [6, 8, 10]], + index=[[4, 4, 8], [8, 10, 12]]) + + # the first row + rs = mi_codes.loc['i'] + with catch_warnings(record=True): + xp = mi_codes.ix['i'] + tm.assert_frame_equal(rs, xp) + + # 2nd (last) columns + rs = mi_codes.loc[:, 'j'] + with catch_warnings(record=True): + xp = mi_codes.ix[:, 'j'] + tm.assert_frame_equal(rs, xp) + + # corner column + rs = mi_codes.loc['j'].loc[:, 'j'] + with catch_warnings(record=True): + xp = mi_codes.ix['j'].ix[:, 'j'] + tm.assert_frame_equal(rs, xp) + + # with a tuple + rs = mi_codes.loc[('i', 'X')] + with catch_warnings(record=True): + xp = mi_codes.ix[('i', 'X')] + tm.assert_frame_equal(rs, xp) + + rs = mi_int.loc[4] + with catch_warnings(record=True): + xp = mi_int.ix[4] + tm.assert_frame_equal(rs, xp) + + # missing label + pytest.raises(KeyError, lambda: mi_int.loc[2]) + with catch_warnings(record=True): + # GH 21593 + pytest.raises(KeyError, lambda: mi_int.ix[2]) + + def test_getitem_partial_int(self): + # GH 12416 + # with single item + l1 = [10, 20] + l2 = ['a', 'b'] + df = DataFrame(index=range(2), + columns=MultiIndex.from_product([l1, l2])) + expected = DataFrame(index=range(2), + columns=l2) + result = df[20] + tm.assert_frame_equal(result, expected) + + # with list + expected = DataFrame(index=range(2), + columns=MultiIndex.from_product([l1[1:], l2])) + result = df[[20]] + tm.assert_frame_equal(result, expected) + + # missing item: + with pytest.raises(KeyError, match='1'): + df[1] + with pytest.raises(KeyError, match=r"'\[1\] not in index'"): + df[[1]] + + def test_loc_multiindex_indexer_none(self): + + # GH6788 + # multi-index indexer is None (meaning take all) + attributes = ['Attribute' + str(i) for i in range(1)] + attribute_values = ['Value' + str(i) for i in range(5)] + + index = MultiIndex.from_product([attributes, attribute_values]) + df = 0.1 * np.random.randn(10, 1 * 5) + 0.5 + df = DataFrame(df, columns=index) + result = df[attributes] + tm.assert_frame_equal(result, df) + + # GH 7349 + # loc with a multi-index seems to be doing fallback + df = DataFrame(np.arange(12).reshape(-1, 1), + index=MultiIndex.from_product([[1, 2, 3, 4], + [1, 2, 3]])) + + expected = df.loc[([1, 2], ), :] + result = df.loc[[1, 2]] + tm.assert_frame_equal(result, expected) + + def test_loc_multiindex_incomplete(self): + + # GH 7399 + # incomplete indexers + s = Series(np.arange(15, dtype='int64'), + MultiIndex.from_product([range(5), ['a', 'b', 'c']])) + expected = s.loc[:, 'a':'c'] + + result = s.loc[0:4, 
'a':'c'] + tm.assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) + + result = s.loc[:4, 'a':'c'] + tm.assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) + + result = s.loc[0:, 'a':'c'] + tm.assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) + + # GH 7400 + # multiindexer gettitem with list of indexers skips wrong element + s = Series(np.arange(15, dtype='int64'), + MultiIndex.from_product([range(5), ['a', 'b', 'c']])) + expected = s.iloc[[6, 7, 8, 12, 13, 14]] + result = s.loc[2:4:2, 'a':'c'] + tm.assert_series_equal(result, expected) + + def test_multiindex_perf_warn(self): + + df = DataFrame({'jim': [0, 0, 1, 1], + 'joe': ['x', 'x', 'z', 'y'], + 'jolie': np.random.rand(4)}).set_index(['jim', 'joe']) + + with tm.assert_produces_warning(PerformanceWarning, + clear=[pd.core.index]): + df.loc[(1, 'z')] + + df = df.iloc[[2, 1, 3, 0]] + with tm.assert_produces_warning(PerformanceWarning): + df.loc[(0, )] + + def test_series_getitem_multiindex(self): + + # GH 6018 + # series regression getitem with a multi-index + + s = Series([1, 2, 3]) + s.index = MultiIndex.from_tuples([(0, 0), (1, 1), (2, 1)]) + + result = s[:, 0] + expected = Series([1], index=[0]) + tm.assert_series_equal(result, expected) + + result = s.loc[:, 1] + expected = Series([2, 3], index=[1, 2]) + tm.assert_series_equal(result, expected) + + # xs + result = s.xs(0, level=0) + expected = Series([1], index=[0]) + tm.assert_series_equal(result, expected) + + result = s.xs(1, level=1) + expected = Series([2, 3], index=[1, 2]) + tm.assert_series_equal(result, expected) + + # GH6258 + dt = list(date_range('20130903', periods=3)) + idx = MultiIndex.from_product([list('AB'), dt]) + s = Series([1, 3, 4, 1, 3, 4], index=idx) + + result = s.xs('20130903', level=1) + expected = Series([1, 1], index=list('AB')) + tm.assert_series_equal(result, expected) + + # GH5684 + idx = MultiIndex.from_tuples([('a', 'one'), ('a', 'two'), ('b', 'one'), + ('b', 'two')]) + s = Series([1, 2, 3, 4], index=idx) + s.index.set_names(['L1', 'L2'], inplace=True) + result = s.xs('one', level='L2') + expected = Series([1, 3], index=['a', 'b']) + expected.index.set_names(['L1'], inplace=True) + tm.assert_series_equal(result, expected) + + def test_xs_multiindex(self): + + # GH2903 + columns = MultiIndex.from_tuples( + [('a', 'foo'), ('a', 'bar'), ('b', 'hello'), + ('b', 'world')], names=['lvl0', 'lvl1']) + df = DataFrame(np.random.randn(4, 4), columns=columns) + df.sort_index(axis=1, inplace=True) + result = df.xs('a', level='lvl0', axis=1) + expected = df.iloc[:, 0:2].loc[:, 'a'] + tm.assert_frame_equal(result, expected) + + result = df.xs('foo', level='lvl1', axis=1) + expected = df.iloc[:, 1:2].copy() + expected.columns = expected.columns.droplevel('lvl1') + tm.assert_frame_equal(result, expected) + + def test_multiindex_setitem(self): + + # GH 3738 + # setting with a multi-index right hand side + arrays = [np.array(['bar', 'bar', 'baz', 'qux', 'qux', 'bar']), + np.array(['one', 'two', 'one', 'one', 'two', 'one']), + np.arange(0, 6, 1)] + + df_orig = DataFrame(np.random.randn(6, 3), index=arrays, + columns=['A', 'B', 'C']).sort_index() + + expected = df_orig.loc[['bar']] * 2 + df = df_orig.copy() + df.loc[['bar']] *= 2 + tm.assert_frame_equal(df.loc[['bar']], expected) + + # raise because these have differing levels + def f(): + df.loc['bar'] *= 2 + + pytest.raises(TypeError, f) + + # from SO + # 
http://stackoverflow.com/questions/24572040/pandas-access-the-level-of-multiindex-for-inplace-operation + df_orig = DataFrame.from_dict({'price': { + ('DE', 'Coal', 'Stock'): 2, + ('DE', 'Gas', 'Stock'): 4, + ('DE', 'Elec', 'Demand'): 1, + ('FR', 'Gas', 'Stock'): 5, + ('FR', 'Solar', 'SupIm'): 0, + ('FR', 'Wind', 'SupIm'): 0 + }}) + df_orig.index = MultiIndex.from_tuples(df_orig.index, + names=['Sit', 'Com', 'Type']) + + expected = df_orig.copy() + expected.iloc[[0, 2, 3]] *= 2 + + idx = pd.IndexSlice + df = df_orig.copy() + df.loc[idx[:, :, 'Stock'], :] *= 2 + tm.assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc[idx[:, :, 'Stock'], 'price'] *= 2 + tm.assert_frame_equal(df, expected) + + def test_getitem_duplicates_multiindex(self): + # GH 5725 the 'A' happens to be a valid Timestamp so the doesn't raise + # the appropriate error, only in PY3 of course! + + index = MultiIndex(levels=[['D', 'B', 'C'], + [0, 26, 27, 37, 57, 67, 75, 82]], + codes=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], + [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]], + names=['tag', 'day']) + arr = np.random.randn(len(index), 1) + df = DataFrame(arr, index=index, columns=['val']) + result = df.val['D'] + expected = Series(arr.ravel()[0:3], name='val', index=Index( + [26, 37, 57], name='day')) + tm.assert_series_equal(result, expected) + + def f(): + df.val['A'] + + pytest.raises(KeyError, f) + + def f(): + df.val['X'] + + pytest.raises(KeyError, f) + + # A is treated as a special Timestamp + index = MultiIndex(levels=[['A', 'B', 'C'], + [0, 26, 27, 37, 57, 67, 75, 82]], + codes=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], + [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]], + names=['tag', 'day']) + df = DataFrame(arr, index=index, columns=['val']) + result = df.val['A'] + expected = Series(arr.ravel()[0:3], name='val', index=Index( + [26, 37, 57], name='day')) + tm.assert_series_equal(result, expected) + + def f(): + df.val['X'] + + pytest.raises(KeyError, f) + + # GH 7866 + # multi-index slicing with missing indexers + idx = MultiIndex.from_product([['A', 'B', 'C'], + ['foo', 'bar', 'baz']], + names=['one', 'two']) + s = Series(np.arange(9, dtype='int64'), index=idx).sort_index() + + exp_idx = MultiIndex.from_product([['A'], ['foo', 'bar', 'baz']], + names=['one', 'two']) + expected = Series(np.arange(3, dtype='int64'), + index=exp_idx).sort_index() + + result = s.loc[['A']] + tm.assert_series_equal(result, expected) + result = s.loc[['A', 'D']] + tm.assert_series_equal(result, expected) + + # not any values found + pytest.raises(KeyError, lambda: s.loc[['D']]) + + # empty ok + result = s.loc[[]] + expected = s.iloc[[]] + tm.assert_series_equal(result, expected) + + idx = pd.IndexSlice + expected = Series([0, 3, 6], index=MultiIndex.from_product( + [['A', 'B', 'C'], ['foo']], names=['one', 'two'])).sort_index() + + result = s.loc[idx[:, ['foo']]] + tm.assert_series_equal(result, expected) + result = s.loc[idx[:, ['foo', 'bah']]] + tm.assert_series_equal(result, expected) + + # GH 8737 + # empty indexer + multi_index = MultiIndex.from_product((['foo', 'bar', 'baz'], + ['alpha', 'beta'])) + df = DataFrame( + np.random.randn(5, 6), index=range(5), columns=multi_index) + df = df.sort_index(level=0, axis=1) + + expected = DataFrame(index=range(5), + columns=multi_index.reindex([])[0]) + result1 = df.loc[:, ([], slice(None))] + result2 = df.loc[:, (['foo'], [])] + tm.assert_frame_equal(result1, expected) + tm.assert_frame_equal(result2, expected) + + # regression from < 0.14.0 + # GH 7914 + df = DataFrame([[np.mean, np.median], ['mean', 'median']], + 
columns=MultiIndex.from_tuples([('functs', 'mean'), + ('functs', 'median')]), + index=['function', 'name']) + result = df.loc['function', ('functs', 'mean')] + assert result == np.mean + + def test_multiindex_assignment(self): + + # GH3777 part 2 + + # mixed dtype + df = DataFrame(np.random.randint(5, 10, size=9).reshape(3, 3), + columns=list('abc'), + index=[[4, 4, 8], [8, 10, 12]]) + df['d'] = np.nan + arr = np.array([0., 1.]) + + with catch_warnings(record=True): + df.ix[4, 'd'] = arr + tm.assert_series_equal(df.ix[4, 'd'], + Series(arr, index=[8, 10], name='d')) + + # single dtype + df = DataFrame(np.random.randint(5, 10, size=9).reshape(3, 3), + columns=list('abc'), + index=[[4, 4, 8], [8, 10, 12]]) + + with catch_warnings(record=True): + df.ix[4, 'c'] = arr + exp = Series(arr, index=[8, 10], name='c', dtype='float64') + tm.assert_series_equal(df.ix[4, 'c'], exp) + + # scalar ok + with catch_warnings(record=True): + df.ix[4, 'c'] = 10 + exp = Series(10, index=[8, 10], name='c', dtype='float64') + tm.assert_series_equal(df.ix[4, 'c'], exp) + + # invalid assignments + def f(): + with catch_warnings(record=True): + df.ix[4, 'c'] = [0, 1, 2, 3] + + pytest.raises(ValueError, f) + + def f(): + with catch_warnings(record=True): + df.ix[4, 'c'] = [0] + + pytest.raises(ValueError, f) + + # groupby example + NUM_ROWS = 100 + NUM_COLS = 10 + col_names = ['A' + num for num in + map(str, np.arange(NUM_COLS).tolist())] + index_cols = col_names[:5] + + df = DataFrame(np.random.randint(5, size=(NUM_ROWS, NUM_COLS)), + dtype=np.int64, columns=col_names) + df = df.set_index(index_cols).sort_index() + grp = df.groupby(level=index_cols[:4]) + df['new_col'] = np.nan + + f_index = np.arange(5) + + def f(name, df2): + return Series(np.arange(df2.shape[0]), + name=df2.index.values[0]).reindex(f_index) + + # TODO(wesm): unused? 
+ # new_df = pd.concat([f(name, df2) for name, df2 in grp], axis=1).T + + # we are actually operating on a copy here + # but in this case, that's ok + for name, df2 in grp: + new_vals = np.arange(df2.shape[0]) + with catch_warnings(record=True): + df.ix[name, 'new_col'] = new_vals + + def test_multiindex_label_slicing_with_negative_step(self): + s = Series(np.arange(20), + MultiIndex.from_product([list('abcde'), np.arange(4)])) + SLC = pd.IndexSlice + + def assert_slices_equivalent(l_slc, i_slc): + tm.assert_series_equal(s.loc[l_slc], s.iloc[i_slc]) + tm.assert_series_equal(s[l_slc], s.iloc[i_slc]) + with catch_warnings(record=True): + tm.assert_series_equal(s.ix[l_slc], s.iloc[i_slc]) + + assert_slices_equivalent(SLC[::-1], SLC[::-1]) + + assert_slices_equivalent(SLC['d'::-1], SLC[15::-1]) + assert_slices_equivalent(SLC[('d', )::-1], SLC[15::-1]) + + assert_slices_equivalent(SLC[:'d':-1], SLC[:11:-1]) + assert_slices_equivalent(SLC[:('d', ):-1], SLC[:11:-1]) + + assert_slices_equivalent(SLC['d':'b':-1], SLC[15:3:-1]) + assert_slices_equivalent(SLC[('d', ):'b':-1], SLC[15:3:-1]) + assert_slices_equivalent(SLC['d':('b', ):-1], SLC[15:3:-1]) + assert_slices_equivalent(SLC[('d', ):('b', ):-1], SLC[15:3:-1]) + assert_slices_equivalent(SLC['b':'d':-1], SLC[:0]) + + assert_slices_equivalent(SLC[('c', 2)::-1], SLC[10::-1]) + assert_slices_equivalent(SLC[:('c', 2):-1], SLC[:9:-1]) + assert_slices_equivalent(SLC[('e', 0):('c', 2):-1], SLC[16:9:-1]) + + def test_multiindex_slice_first_level(self): + # GH 12697 + freq = ['a', 'b', 'c', 'd'] + idx = MultiIndex.from_product([freq, np.arange(500)]) + df = DataFrame(list(range(2000)), index=idx, columns=['Test']) + df_slice = df.loc[pd.IndexSlice[:, 30:70], :] + result = df_slice.loc['a'] + expected = DataFrame(list(range(30, 71)), + columns=['Test'], index=range(30, 71)) + tm.assert_frame_equal(result, expected) + result = df_slice.loc['d'] + expected = DataFrame(list(range(1530, 1571)), + columns=['Test'], index=range(30, 71)) + tm.assert_frame_equal(result, expected) + + def test_multiindex_symmetric_difference(self): + # GH 13490 + idx = MultiIndex.from_product([['a', 'b'], ['A', 'B']], + names=['a', 'b']) + result = idx ^ idx + assert result.names == idx.names + + idx2 = idx.copy().rename(['A', 'B']) + result = idx ^ idx2 + assert result.names == [None, None] + + def test_multiindex_contains_dropped(self): + # GH 19027 + # test that dropped MultiIndex levels are not in the MultiIndex + # despite continuing to be in the MultiIndex's levels + idx = MultiIndex.from_product([[1, 2], [3, 4]]) + assert 2 in idx + idx = idx.drop(2) + + # drop implementation keeps 2 in the levels + assert 2 in idx.levels[0] + # but it should no longer be in the index itself + assert 2 not in idx + + # also applies to strings + idx = MultiIndex.from_product([['a', 'b'], ['c', 'd']]) + assert 'a' in idx + idx = idx.drop('a') + assert 'a' in idx.levels[0] + assert 'a' not in idx + + @pytest.mark.parametrize("data, expected", [ + (MultiIndex.from_product([(), ()]), True), + (MultiIndex.from_product([(1, 2), (3, 4)]), True), + (MultiIndex.from_product([('a', 'b'), (1, 2)]), False), + ]) + def test_multiindex_is_homogeneous_type(self, data, expected): + assert data._is_homogeneous_type is expected + + def test_getitem_simple(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + df = frame.T + + col = df['foo', 'one'] + tm.assert_almost_equal(col.values, df.values[:, 0]) + with pytest.raises(KeyError): + df[('foo', 'four')] + with 
pytest.raises(KeyError): + df['foobar'] + + def test_series_getitem( + self, multiindex_year_month_day_dataframe_random_data): + ymd = multiindex_year_month_day_dataframe_random_data + s = ymd['A'] + + result = s[2000, 3] + + # TODO(wesm): unused? + # result2 = s.loc[2000, 3] + + expected = s.reindex(s.index[42:65]) + expected.index = expected.index.droplevel(0).droplevel(0) + tm.assert_series_equal(result, expected) + + result = s[2000, 3, 10] + expected = s[49] + assert result == expected + + # fancy + expected = s.reindex(s.index[49:51]) + result = s.loc[[(2000, 3, 10), (2000, 3, 13)]] + tm.assert_series_equal(result, expected) + + with catch_warnings(record=True): + simplefilter("ignore", DeprecationWarning) + result = s.ix[[(2000, 3, 10), (2000, 3, 13)]] + tm.assert_series_equal(result, expected) + + # key error + pytest.raises(KeyError, s.__getitem__, (2000, 3, 4)) + + def test_series_getitem_corner( + self, multiindex_year_month_day_dataframe_random_data): + ymd = multiindex_year_month_day_dataframe_random_data + s = ymd['A'] + + # don't segfault, GH #495 + # out of bounds access + pytest.raises(IndexError, s.__getitem__, len(ymd)) + + # generator + result = s[(x > 0 for x in s)] + expected = s[s > 0] + tm.assert_series_equal(result, expected) + + def test_series_setitem( + self, multiindex_year_month_day_dataframe_random_data): + ymd = multiindex_year_month_day_dataframe_random_data + s = ymd['A'] + + s[2000, 3] = np.nan + assert isna(s.values[42:65]).all() + assert notna(s.values[:42]).all() + assert notna(s.values[65:]).all() + + s[2000, 3, 10] = np.nan + assert isna(s[49]) + + def test_series_slice_partial(self): + pass + + def test_frame_getitem_setitem_boolean( + self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + df = frame.T.copy() + values = df.values + + result = df[df > 0] + expected = df.where(df > 0) + tm.assert_frame_equal(result, expected) + + df[df > 0] = 5 + values[values > 0] = 5 + tm.assert_almost_equal(df.values, values) + + df[df == 5] = 0 + values[values == 5] = 0 + tm.assert_almost_equal(df.values, values) + + # a df that needs alignment first + df[df[:-1] < 0] = 2 + np.putmask(values[:-1], values[:-1] < 0, 2) + tm.assert_almost_equal(df.values, values) + + with pytest.raises(TypeError, match='boolean values only'): + df[df * 0] = 2 + + def test_frame_getitem_setitem_slice( + self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + # getitem + result = frame.iloc[:4] + expected = frame[:4] + tm.assert_frame_equal(result, expected) + + # setitem + cp = frame.copy() + cp.iloc[:4] = 0 + + assert (cp.values[:4] == 0).all() + assert (cp.values[4:] != 0).all() + + def test_frame_getitem_setitem_multislice(self): + levels = [['t1', 't2'], ['a', 'b', 'c']] + labels = [[0, 0, 0, 1, 1], [0, 1, 2, 0, 1]] + midx = MultiIndex(labels=labels, levels=levels, names=[None, 'id']) + df = DataFrame({'value': [1, 2, 3, 7, 8]}, index=midx) + + result = df.loc[:, 'value'] + tm.assert_series_equal(df['value'], result) + + with catch_warnings(record=True): + simplefilter("ignore", DeprecationWarning) + result = df.ix[:, 'value'] + tm.assert_series_equal(df['value'], result) + + result = df.loc[df.index[1:3], 'value'] + tm.assert_series_equal(df['value'][1:3], result) + + result = df.loc[:, :] + tm.assert_frame_equal(df, result) + + result = df + df.loc[:, 'value'] = 10 + result['value'] = 10 + tm.assert_frame_equal(df, result) + + df.loc[:, :] = 10 + tm.assert_frame_equal(df, result) + + def 
test_frame_getitem_multicolumn_empty_level(self): + f = DataFrame({'a': ['1', '2', '3'], 'b': ['2', '3', '4']}) + f.columns = [['level1 item1', 'level1 item2'], ['', 'level2 item2'], + ['level3 item1', 'level3 item2']] + + result = f['level1 item1'] + expected = DataFrame([['1'], ['2'], ['3']], index=f.index, + columns=['level3 item1']) + tm.assert_frame_equal(result, expected) + + def test_frame_setitem_multi_column(self): + df = DataFrame(randn(10, 4), columns=[['a', 'a', 'b', 'b'], + [0, 1, 0, 1]]) + + cp = df.copy() + cp['a'] = cp['b'] + tm.assert_frame_equal(cp['a'], cp['b']) + + # set with ndarray + cp = df.copy() + cp['a'] = cp['b'].values + tm.assert_frame_equal(cp['a'], cp['b']) + + # --------------------------------------- + # #1803 + columns = MultiIndex.from_tuples([('A', '1'), ('A', '2'), ('B', '1')]) + df = DataFrame(index=[1, 3, 5], columns=columns) + + # Works, but adds a column instead of updating the two existing ones + df['A'] = 0.0 # Doesn't work + assert (df['A'].values == 0).all() + + # it broadcasts + df['B', '1'] = [1, 2, 3] + df['A'] = df['B', '1'] + + sliced_a1 = df['A', '1'] + sliced_a2 = df['A', '2'] + sliced_b1 = df['B', '1'] + tm.assert_series_equal(sliced_a1, sliced_b1, check_names=False) + tm.assert_series_equal(sliced_a2, sliced_b1, check_names=False) + assert sliced_a1.name == ('A', '1') + assert sliced_a2.name == ('A', '2') + assert sliced_b1.name == ('B', '1') + + def test_getitem_tuple_plus_slice(self): + # GH #671 + df = DataFrame({'a': lrange(10), + 'b': lrange(10), + 'c': np.random.randn(10), + 'd': np.random.randn(10)}) + + idf = df.set_index(['a', 'b']) + + result = idf.loc[(0, 0), :] + expected = idf.loc[0, 0] + expected2 = idf.xs((0, 0)) + with catch_warnings(record=True): + simplefilter("ignore", DeprecationWarning) + expected3 = idf.ix[0, 0] + + tm.assert_series_equal(result, expected) + tm.assert_series_equal(result, expected2) + tm.assert_series_equal(result, expected3) + + def test_getitem_setitem_tuple_plus_columns( + self, multiindex_year_month_day_dataframe_random_data): + # GH #1013 + ymd = multiindex_year_month_day_dataframe_random_data + df = ymd[:5] + + result = df.loc[(2000, 1, 6), ['A', 'B', 'C']] + expected = df.loc[2000, 1, 6][['A', 'B', 'C']] + tm.assert_series_equal(result, expected) + + def test_xs(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + xs = frame.xs(('bar', 'two')) + xs2 = frame.loc[('bar', 'two')] + + tm.assert_series_equal(xs, xs2) + tm.assert_almost_equal(xs.values, frame.values[4]) + + # GH 6574 + # missing values in returned index should be preserrved + acc = [ + ('a', 'abcde', 1), + ('b', 'bbcde', 2), + ('y', 'yzcde', 25), + ('z', 'xbcde', 24), + ('z', None, 26), + ('z', 'zbcde', 25), + ('z', 'ybcde', 26), + ] + df = DataFrame(acc, + columns=['a1', 'a2', 'cnt']).set_index(['a1', 'a2']) + expected = DataFrame({'cnt': [24, 26, 25, 26]}, index=Index( + ['xbcde', np.nan, 'zbcde', 'ybcde'], name='a2')) + + result = df.xs('z', level='a1') + tm.assert_frame_equal(result, expected) + + def test_xs_partial(self, multiindex_dataframe_random_data, + multiindex_year_month_day_dataframe_random_data): + frame = multiindex_dataframe_random_data + ymd = multiindex_year_month_day_dataframe_random_data + result = frame.xs('foo') + result2 = frame.loc['foo'] + expected = frame.T['foo'].T + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, result2) + + result = ymd.xs((2000, 4)) + expected = ymd.loc[2000, 4] + tm.assert_frame_equal(result, expected) + + # ex from #1796 + 
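+ # a three-level index: xs with the list key ['foo', 'one'] should drill
+ # through the first two levels and match df.loc['foo', 'one']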
index = MultiIndex(levels=[['foo', 'bar'], ['one', 'two'], [-1, 1]], + labels=[[0, 0, 0, 0, 1, 1, 1, 1], + [0, 0, 1, 1, 0, 0, 1, 1], [0, 1, 0, 1, 0, 1, + 0, 1]]) + df = DataFrame(np.random.randn(8, 4), index=index, + columns=list('abcd')) + + result = df.xs(['foo', 'one']) + expected = df.loc['foo', 'one'] + tm.assert_frame_equal(result, expected) + + def test_xs_with_duplicates(self, multiindex_dataframe_random_data): + # Issue #13719 + frame = multiindex_dataframe_random_data + df_dup = concat([frame] * 2) + assert df_dup.index.is_unique is False + expected = concat([frame.xs('one', level='second')] * 2) + tm.assert_frame_equal(df_dup.xs('one', level='second'), expected) + tm.assert_frame_equal(df_dup.xs(['one'], level=['second']), expected) + + def test_xs_level(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + result = frame.xs('two', level='second') + expected = frame[frame.index.get_level_values(1) == 'two'] + expected.index = expected.index.droplevel(1) + + tm.assert_frame_equal(result, expected) + + index = MultiIndex.from_tuples([('x', 'y', 'z'), ('a', 'b', 'c'), ( + 'p', 'q', 'r')]) + df = DataFrame(np.random.randn(3, 5), index=index) + result = df.xs('c', level=2) + expected = df[1:2] + expected.index = expected.index.droplevel(2) + tm.assert_frame_equal(result, expected) + + # this is a copy in 0.14 + result = frame.xs('two', level='second') + + # setting this will give a SettingWithCopyError + # as we are trying to write a view + def f(x): + x[:] = 10 + + pytest.raises(com.SettingWithCopyError, f, result) + + def test_xs_level_multiple(self): + text = """ A B C D E +one two three four +a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640 +a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744 +x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838""" + + df = read_csv(StringIO(text), sep=r'\s+', engine='python') + + result = df.xs(('a', 4), level=['one', 'four']) + expected = df.xs('a').xs(4, level='four') + tm.assert_frame_equal(result, expected) + + # this is a copy in 0.14 + result = df.xs(('a', 4), level=['one', 'four']) + + # setting this will give a SettingWithCopyError + # as we are trying to write a view + def f(x): + x[:] = 10 + + pytest.raises(com.SettingWithCopyError, f, result) + + # GH2107 + dates = lrange(20111201, 20111205) + ids = 'abcde' + idx = MultiIndex.from_tuples([x for x in cart_product(dates, ids)]) + idx.names = ['date', 'secid'] + df = DataFrame(np.random.randn(len(idx), 3), idx, ['X', 'Y', 'Z']) + + rs = df.xs(20111201, level='date') + xp = df.loc[20111201, :] + tm.assert_frame_equal(rs, xp) + + def test_xs_level0(self): + text = """ A B C D E +one two three four +a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640 +a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744 +x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838""" + + df = read_csv(StringIO(text), sep=r'\s+', engine='python') + + result = df.xs('a', level=0) + expected = df.xs('a') + assert len(result) == 2 + tm.assert_frame_equal(result, expected) + + def test_xs_level_series(self, multiindex_dataframe_random_data, + multiindex_year_month_day_dataframe_random_data): + frame = multiindex_dataframe_random_data + ymd = multiindex_year_month_day_dataframe_random_data + s = frame['A'] + result = s[:, 'two'] + expected = frame.xs('two', level=1)['A'] + tm.assert_series_equal(result, expected) + + s = ymd['A'] + result = s[2000, 5] + expected = ymd.loc[2000, 5]['A'] + tm.assert_series_equal(result, expected) + + # not implementing this for now + + pytest.raises(TypeError, s.__getitem__, 
(2000, slice(3, 4))) + + # result = s[2000, 3:4] + # lv =s.index.get_level_values(1) + # expected = s[(lv == 3) | (lv == 4)] + # expected.index = expected.index.droplevel(0) + # tm.assert_series_equal(result, expected) + + # can do this though + + def test_get_loc_single_level(self, single_level_multiindex): + single_level = single_level_multiindex + s = Series(np.random.randn(len(single_level)), + index=single_level) + for k in single_level.values: + s[k] + + def test_getitem_toplevel(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + df = frame.T + + result = df['foo'] + expected = df.reindex(columns=df.columns[:3]) + expected.columns = expected.columns.droplevel(0) + tm.assert_frame_equal(result, expected) + + result = df['bar'] + result2 = df.loc[:, 'bar'] + + expected = df.reindex(columns=df.columns[3:5]) + expected.columns = expected.columns.droplevel(0) + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, result2) + + def test_getitem_setitem_slice_integers(self): + index = MultiIndex(levels=[[0, 1, 2], [0, 2]], + labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]]) + + frame = DataFrame(np.random.randn(len(index), 4), index=index, + columns=['a', 'b', 'c', 'd']) + res = frame.loc[1:2] + exp = frame.reindex(frame.index[2:]) + tm.assert_frame_equal(res, exp) + + frame.loc[1:2] = 7 + assert (frame.loc[1:2] == 7).values.all() + + series = Series(np.random.randn(len(index)), index=index) + + res = series.loc[1:2] + exp = series.reindex(series.index[2:]) + tm.assert_series_equal(res, exp) + + series.loc[1:2] = 7 + assert (series.loc[1:2] == 7).values.all() + + def test_getitem_int(self, multiindex_dataframe_random_data): + levels = [[0, 1], [0, 1, 2]] + labels = [[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]] + index = MultiIndex(levels=levels, labels=labels) + + frame = DataFrame(np.random.randn(6, 2), index=index) + + result = frame.loc[1] + expected = frame[-3:] + expected.index = expected.index.droplevel(0) + tm.assert_frame_equal(result, expected) + + # raises exception + pytest.raises(KeyError, frame.loc.__getitem__, 3) + + # however this will work + frame = multiindex_dataframe_random_data + result = frame.iloc[2] + expected = frame.xs(frame.index[2]) + tm.assert_series_equal(result, expected) + + def test_getitem_partial( + self, multiindex_year_month_day_dataframe_random_data): + ymd = multiindex_year_month_day_dataframe_random_data + ymd = ymd.T + result = ymd[2000, 2] + + expected = ymd.reindex(columns=ymd.columns[ymd.columns.labels[1] == 1]) + expected.columns = expected.columns.droplevel(0).droplevel(0) + tm.assert_frame_equal(result, expected) + + def test_setitem_change_dtype(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + dft = frame.T + s = dft['foo', 'two'] + dft['foo', 'two'] = s > s.median() + tm.assert_series_equal(dft['foo', 'two'], s > s.median()) + # assert isinstance(dft._data.blocks[1].items, MultiIndex) + + reindexed = dft.reindex(columns=[('foo', 'two')]) + tm.assert_series_equal(reindexed['foo', 'two'], s > s.median()) + + def test_frame_setitem_ix(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + frame.loc[('bar', 'two'), 'B'] = 5 + assert frame.loc[('bar', 'two'), 'B'] == 5 + + # with integer labels + df = frame.copy() + df.columns = lrange(3) + df.loc[('bar', 'two'), 1] = 7 + assert df.loc[('bar', 'two'), 1] == 7 + + with catch_warnings(record=True): + simplefilter("ignore", DeprecationWarning) + df = frame.copy() + df.columns = lrange(3) 
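+ # the deprecated .ix setter should mirror the .loc assignment above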
+ df.ix[('bar', 'two'), 1] = 7 + assert df.loc[('bar', 'two'), 1] == 7 + + def test_fancy_slice_partial( + self, multiindex_dataframe_random_data, + multiindex_year_month_day_dataframe_random_data): + frame = multiindex_dataframe_random_data + result = frame.loc['bar':'baz'] + expected = frame[3:7] + tm.assert_frame_equal(result, expected) + + ymd = multiindex_year_month_day_dataframe_random_data + result = ymd.loc[(2000, 2):(2000, 4)] + lev = ymd.index.labels[1] + expected = ymd[(lev >= 1) & (lev <= 3)] + tm.assert_frame_equal(result, expected) + + def test_getitem_partial_column_select(self): + idx = MultiIndex(labels=[[0, 0, 0], [0, 1, 1], [1, 0, 1]], + levels=[['a', 'b'], ['x', 'y'], ['p', 'q']]) + df = DataFrame(np.random.rand(3, 2), index=idx) + + result = df.loc[('a', 'y'), :] + expected = df.loc[('a', 'y')] + tm.assert_frame_equal(result, expected) + + result = df.loc[('a', 'y'), [1, 0]] + expected = df.loc[('a', 'y')][[1, 0]] + tm.assert_frame_equal(result, expected) + + with catch_warnings(record=True): + simplefilter("ignore", DeprecationWarning) + result = df.ix[('a', 'y'), [1, 0]] + tm.assert_frame_equal(result, expected) + + pytest.raises(KeyError, df.loc.__getitem__, + (('a', 'foo'), slice(None, None))) + + def test_frame_getitem_view(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + df = frame.T.copy() + + # this works because we are modifying the underlying array + # really a no-no + df['foo'].values[:] = 0 + assert (df['foo'].values == 0).all() + + # but not if it's mixed-type + df['foo', 'four'] = 'foo' + df = df.sort_index(level=0, axis=1) + + # this will work, but will raise/warn as its chained assignment + def f(): + df['foo']['one'] = 2 + return df + + pytest.raises(com.SettingWithCopyError, f) + + try: + df = f() + except ValueError: + pass + assert (df['foo', 'one'] == 0).all() + + def test_partial_set( + self, multiindex_year_month_day_dataframe_random_data): + # GH #397 + ymd = multiindex_year_month_day_dataframe_random_data + df = ymd.copy() + exp = ymd.copy() + df.loc[2000, 4] = 0 + exp.loc[2000, 4].values[:] = 0 + tm.assert_frame_equal(df, exp) + + df['A'].loc[2000, 4] = 1 + exp['A'].loc[2000, 4].values[:] = 1 + tm.assert_frame_equal(df, exp) + + df.loc[2000] = 5 + exp.loc[2000].values[:] = 5 + tm.assert_frame_equal(df, exp) + + # this works...for now + df['A'].iloc[14] = 5 + assert df['A'][14] == 5 + + def test_getitem_lowerdim_corner(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + pytest.raises(KeyError, frame.loc.__getitem__, + (('bar', 'three'), 'B')) + + # in theory should be inserting in a sorted space???? + frame.loc[('bar', 'three'), 'B'] = 0 + assert frame.sort_index().loc[('bar', 'three'), 'B'] == 0 + + # --------------------------------------------------------------------- + # AMBIGUOUS CASES! + + def test_partial_ix_missing( + self, multiindex_year_month_day_dataframe_random_data): + pytest.skip("skipping for now") + + ymd = multiindex_year_month_day_dataframe_random_data + result = ymd.loc[2000, 0] + expected = ymd.loc[2000]['A'] + tm.assert_series_equal(result, expected) + + # need to put in some work here + + # self.ymd.loc[2000, 0] = 0 + # assert (self.ymd.loc[2000]['A'] == 0).all() + + # Pretty sure the second (and maybe even the first) is already wrong. 
+ pytest.raises(Exception, ymd.loc.__getitem__, (2000, 6)) + pytest.raises(Exception, ymd.loc.__getitem__, (2000, 6), 0) + + # --------------------------------------------------------------------- + + def test_int_series_slicing( + self, multiindex_year_month_day_dataframe_random_data): + ymd = multiindex_year_month_day_dataframe_random_data + s = ymd['A'] + result = s[5:] + expected = s.reindex(s.index[5:]) + tm.assert_series_equal(result, expected) + + exp = ymd['A'].copy() + s[5:] = 0 + exp.values[5:] = 0 + tm.assert_numpy_array_equal(s.values, exp.values) + + result = ymd[5:] + expected = ymd.reindex(s.index[5:]) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize('unicode_strings', [True, False]) + def test_mixed_depth_get(self, unicode_strings): + # If unicode_strings is True, the column labels in dataframe + # construction will use unicode strings in Python 2 (pull request + # #17099). + + arrays = [['a', 'top', 'top', 'routine1', 'routine1', 'routine2'], + ['', 'OD', 'OD', 'result1', 'result2', 'result1'], + ['', 'wx', 'wy', '', '', '']] + + if unicode_strings: + arrays = [[u(s) for s in arr] for arr in arrays] + + tuples = sorted(zip(*arrays)) + index = MultiIndex.from_tuples(tuples) + df = DataFrame(np.random.randn(4, 6), columns=index) + + result = df['a'] + expected = df['a', '', ''].rename('a') + tm.assert_series_equal(result, expected) + + result = df['routine1', 'result1'] + expected = df['routine1', 'result1', ''] + expected = expected.rename(('routine1', 'result1')) + tm.assert_series_equal(result, expected) + + def test_mixed_depth_insert(self): + arrays = [['a', 'top', 'top', 'routine1', 'routine1', 'routine2'], + ['', 'OD', 'OD', 'result1', 'result2', 'result1'], + ['', 'wx', 'wy', '', '', '']] + + tuples = sorted(zip(*arrays)) + index = MultiIndex.from_tuples(tuples) + df = DataFrame(randn(4, 6), columns=index) + + result = df.copy() + expected = df.copy() + result['b'] = [1, 2, 3, 4] + expected['b', '', ''] = [1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + def test_setitem_multiple_partial(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + expected = frame.copy() + result = frame.copy() + result.loc[['foo', 'bar']] = 0 + expected.loc['foo'] = 0 + expected.loc['bar'] = 0 + tm.assert_frame_equal(result, expected) + + expected = frame.copy() + result = frame.copy() + result.loc['foo':'bar'] = 0 + expected.loc['foo'] = 0 + expected.loc['bar'] = 0 + tm.assert_frame_equal(result, expected) + + expected = frame['A'].copy() + result = frame['A'].copy() + result.loc[['foo', 'bar']] = 0 + expected.loc['foo'] = 0 + expected.loc['bar'] = 0 + tm.assert_series_equal(result, expected) + + expected = frame['A'].copy() + result = frame['A'].copy() + result.loc['foo':'bar'] = 0 + expected.loc['foo'] = 0 + expected.loc['bar'] = 0 + tm.assert_series_equal(result, expected) + + def test_dataframe_insert_column_all_na(self): + # GH #1534 + mix = MultiIndex.from_tuples([('1a', '2a'), ('1a', '2b'), ('1a', '2c') + ]) + df = DataFrame([[1, 2], [3, 4], [5, 6]], index=mix) + s = Series({(1, 1): 1, (1, 2): 2}) + df['new'] = s + assert df['new'].isna().all() + + def test_set_column_scalar_with_ix(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + subset = frame.index[[1, 4, 5]] + + frame.loc[subset] = 99 + assert (frame.loc[subset].values == 99).all() + + col = frame['B'] + col[subset] = 97 + assert (frame.loc[subset, 'B'] == 97).all() + + def test_indexing_ambiguity_bug_1678(self): + columns = 
MultiIndex.from_tuples([('Ohio', 'Green'), ('Ohio', 'Red'), ( + 'Colorado', 'Green')]) + index = MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2) + ]) + + frame = DataFrame(np.arange(12).reshape((4, 3)), index=index, + columns=columns) + + result = frame.iloc[:, 1] + exp = frame.loc[:, ('Ohio', 'Red')] + assert isinstance(result, Series) + tm.assert_series_equal(result, exp) + + def test_nonunique_assignment_1750(self): + df = DataFrame([[1, 1, "x", "X"], [1, 1, "y", "Y"], [1, 2, "z", "Z"]], + columns=list("ABCD")) + + df = df.set_index(['A', 'B']) + ix = MultiIndex.from_tuples([(1, 1)]) + + df.loc[ix, "C"] = '_' + + assert (df.xs((1, 1))['C'] == '_').all() + + def test_indexing_over_hashtable_size_cutoff(self): + n = 10000 + + old_cutoff = _index._SIZE_CUTOFF + _index._SIZE_CUTOFF = 20000 + + s = Series(np.arange(n), + MultiIndex.from_arrays((["a"] * n, np.arange(n)))) + + # hai it works! + assert s[("a", 5)] == 5 + assert s[("a", 6)] == 6 + assert s[("a", 7)] == 7 + + _index._SIZE_CUTOFF = old_cutoff + + def test_iloc_mi(self): + # GH 13797 + # Test if iloc can handle integer locations in MultiIndexed DataFrame + + data = [['str00', 'str01'], ['str10', 'str11'], ['str20', 'srt21'], + ['str30', 'str31'], ['str40', 'str41']] + + mi = MultiIndex.from_tuples( + [('CC', 'A'), ('CC', 'B'), ('CC', 'B'), ('BB', 'a'), ('BB', 'b')]) + + expected = DataFrame(data) + df_mi = DataFrame(data, index=mi) + + result = DataFrame([[df_mi.iloc[r, c] for c in range(2)] + for r in range(5)]) + + tm.assert_frame_equal(result, expected) + + def test_getitem_multilevel_index_tuple_not_sorted(self): + index_columns = list("abc") + df = DataFrame([[0, 1, 0, "x"], [0, 0, 1, "y"]], + columns=index_columns + ["data"]) + df = df.set_index(index_columns) + query_index = df.index[:1] + rs = df.loc[query_index, "data"] + + xp_idx = MultiIndex.from_tuples([(0, 1, 0)], names=['a', 'b', 'c']) + xp = Series(['x'], index=xp_idx, name='data') + tm.assert_series_equal(rs, xp) + + def test_getitem_slice_not_sorted(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + df = frame.sort_index(level=1).T + + # buglet with int typechecking + result = df.iloc[:, :np.int32(3)] + expected = df.reindex(columns=df.columns[:3]) + tm.assert_frame_equal(result, expected) + + def test_frame_getitem_not_sorted2(self): + # 13431 + df = DataFrame({'col1': ['b', 'd', 'b', 'a'], + 'col2': [3, 1, 1, 2], + 'data': ['one', 'two', 'three', 'four']}) + + df2 = df.set_index(['col1', 'col2']) + df2_original = df2.copy() + + df2.index.set_levels(['b', 'd', 'a'], level='col1', inplace=True) + df2.index.set_labels([0, 1, 0, 2], level='col1', inplace=True) + assert not df2.index.is_lexsorted() + assert not df2.index.is_monotonic + + assert df2_original.index.equals(df2.index) + expected = df2.sort_index() + assert expected.index.is_lexsorted() + assert expected.index.is_monotonic + + result = df2.sort_index(level=0) + assert result.index.is_lexsorted() + assert result.index.is_monotonic + tm.assert_frame_equal(result, expected) + + def test_frame_getitem_not_sorted(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + df = frame.T + df['foo', 'four'] = 'foo' + + arrays = [np.array(x) for x in zip(*df.columns.values)] + + result = df['foo'] + result2 = df.loc[:, 'foo'] + expected = df.reindex(columns=df.columns[arrays[0] == 'foo']) + expected.columns = expected.columns.droplevel(0) + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result2, expected) + + df = df.T + 
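+ # repeat the selections against the (unsorted) row index of the
+ # transposed frame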
result = df.xs('foo') + result2 = df.loc['foo'] + expected = df.reindex(df.index[arrays[0] == 'foo']) + expected.index = expected.index.droplevel(0) + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result2, expected) + + def test_series_getitem_not_sorted(self): + arrays = [['bar', 'bar', 'baz', 'baz', 'qux', 'qux', 'foo', 'foo'], + ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] + tuples = lzip(*arrays) + index = MultiIndex.from_tuples(tuples) + s = Series(randn(8), index=index) + + arrays = [np.array(x) for x in zip(*index.values)] + + result = s['qux'] + result2 = s.loc['qux'] + expected = s[arrays[0] == 'qux'] + expected.index = expected.index.droplevel(0) + tm.assert_series_equal(result, expected) + tm.assert_series_equal(result2, expected) + + +class TestMultiIndexSlicers(object): + + def test_per_axis_per_level_getitem(self): + + # GH6134 + # example test case + ix = MultiIndex.from_product([_mklbl('A', 5), _mklbl('B', 7), _mklbl( + 'C', 4), _mklbl('D', 2)]) + df = DataFrame(np.arange(len(ix.get_values())), index=ix) + + result = df.loc[(slice('A1', 'A3'), slice(None), ['C1', 'C3']), :] + expected = df.loc[[tuple([a, b, c, d]) + for a, b, c, d in df.index.values + if (a == 'A1' or a == 'A2' or a == 'A3') and ( + c == 'C1' or c == 'C3')]] + tm.assert_frame_equal(result, expected) + + expected = df.loc[[tuple([a, b, c, d]) + for a, b, c, d in df.index.values + if (a == 'A1' or a == 'A2' or a == 'A3') and ( + c == 'C1' or c == 'C2' or c == 'C3')]] + result = df.loc[(slice('A1', 'A3'), slice(None), slice('C1', 'C3')), :] + tm.assert_frame_equal(result, expected) + + # test multi-index slicing with per axis and per index controls + index = MultiIndex.from_tuples([('A', 1), ('A', 2), + ('A', 3), ('B', 1)], + names=['one', 'two']) + columns = MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'), + ('b', 'foo'), ('b', 'bah')], + names=['lvl0', 'lvl1']) + + df = DataFrame( + np.arange(16, dtype='int64').reshape( + 4, 4), index=index, columns=columns) + df = df.sort_index(axis=0).sort_index(axis=1) + + # identity + result = df.loc[(slice(None), slice(None)), :] + tm.assert_frame_equal(result, df) + result = df.loc[(slice(None), slice(None)), (slice(None), slice(None))] + tm.assert_frame_equal(result, df) + result = df.loc[:, (slice(None), slice(None))] + tm.assert_frame_equal(result, df) + + # index + result = df.loc[(slice(None), [1]), :] + expected = df.iloc[[0, 3]] + tm.assert_frame_equal(result, expected) + + result = df.loc[(slice(None), 1), :] + expected = df.iloc[[0, 3]] + tm.assert_frame_equal(result, expected) + + # columns + result = df.loc[:, (slice(None), ['foo'])] + expected = df.iloc[:, [1, 3]] + tm.assert_frame_equal(result, expected) + + # both + result = df.loc[(slice(None), 1), (slice(None), ['foo'])] + expected = df.iloc[[0, 3], [1, 3]] + tm.assert_frame_equal(result, expected) + + result = df.loc['A', 'a'] + expected = DataFrame(dict(bar=[1, 5, 9], foo=[0, 4, 8]), + index=Index([1, 2, 3], name='two'), + columns=Index(['bar', 'foo'], name='lvl1')) + tm.assert_frame_equal(result, expected) + + result = df.loc[(slice(None), [1, 2]), :] + expected = df.iloc[[0, 1, 3]] + tm.assert_frame_equal(result, expected) + + # multi-level series + s = Series(np.arange(len(ix.get_values())), index=ix) + result = s.loc['A1':'A3', :, ['C1', 'C3']] + expected = s.loc[[tuple([a, b, c, d]) + for a, b, c, d in s.index.values + if (a == 'A1' or a == 'A2' or a == 'A3') and ( + c == 'C1' or c == 'C3')]] + tm.assert_series_equal(result, expected) + + # boolean indexers + 
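+ # a boolean Series built from the frame itself is a valid indexer inside
+ # the slicer tuple; a mis-sized raw boolean array should raise (see f below)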
result = df.loc[(slice(None), df.loc[:, ('a', 'bar')] > 5), :] + expected = df.iloc[[2, 3]] + tm.assert_frame_equal(result, expected) + + def f(): + df.loc[(slice(None), np.array([True, False])), :] + + pytest.raises(ValueError, f) + + # ambiguous cases + # these can be multiply interpreted (e.g. in this case + # as df.loc[slice(None),[1]] as well + pytest.raises(KeyError, lambda: df.loc[slice(None), [1]]) + + result = df.loc[(slice(None), [1]), :] + expected = df.iloc[[0, 3]] + tm.assert_frame_equal(result, expected) + + # not lexsorted + assert df.index.lexsort_depth == 2 + df = df.sort_index(level=1, axis=0) + assert df.index.lexsort_depth == 0 + + msg = ('MultiIndex slicing requires the index to be ' + r'lexsorted: slicing on levels \[1\], lexsort depth 0') + with pytest.raises(UnsortedIndexError, match=msg): + df.loc[(slice(None), slice('bar')), :] + + # GH 16734: not sorted, but no real slicing + result = df.loc[(slice(None), df.loc[:, ('a', 'bar')] > 5), :] + tm.assert_frame_equal(result, df.iloc[[1, 3], :]) + + def test_multiindex_slicers_non_unique(self): + + # GH 7106 + # non-unique mi index support + df = (DataFrame(dict(A=['foo', 'foo', 'foo', 'foo'], + B=['a', 'a', 'a', 'a'], + C=[1, 2, 1, 3], + D=[1, 2, 3, 4])) + .set_index(['A', 'B', 'C']).sort_index()) + assert not df.index.is_unique + expected = (DataFrame(dict(A=['foo', 'foo'], B=['a', 'a'], + C=[1, 1], D=[1, 3])) + .set_index(['A', 'B', 'C']).sort_index()) + result = df.loc[(slice(None), slice(None), 1), :] + tm.assert_frame_equal(result, expected) + + # this is equivalent of an xs expression + result = df.xs(1, level=2, drop_level=False) + tm.assert_frame_equal(result, expected) + + df = (DataFrame(dict(A=['foo', 'foo', 'foo', 'foo'], + B=['a', 'a', 'a', 'a'], + C=[1, 2, 1, 2], + D=[1, 2, 3, 4])) + .set_index(['A', 'B', 'C']).sort_index()) + assert not df.index.is_unique + expected = (DataFrame(dict(A=['foo', 'foo'], B=['a', 'a'], + C=[1, 1], D=[1, 3])) + .set_index(['A', 'B', 'C']).sort_index()) + result = df.loc[(slice(None), slice(None), 1), :] + assert not result.index.is_unique + tm.assert_frame_equal(result, expected) + + # GH12896 + # numpy-implementation dependent bug + ints = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 13, 14, 14, 16, + 17, 18, 19, 200000, 200000] + n = len(ints) + idx = MultiIndex.from_arrays([['a'] * n, ints]) + result = Series([1] * n, index=idx) + result = result.sort_index() + result = result.loc[(slice(None), slice(100000))] + expected = Series([1] * (n - 2), index=idx[:-2]).sort_index() + tm.assert_series_equal(result, expected) + + def test_multiindex_slicers_datetimelike(self): + + # GH 7429 + # buggy/inconsistent behavior when slicing with datetime-like + import datetime + dates = [datetime.datetime(2012, 1, 1, 12, 12, 12) + + datetime.timedelta(days=i) for i in range(6)] + freq = [1, 2] + index = MultiIndex.from_product( + [dates, freq], names=['date', 'frequency']) + + df = DataFrame( + np.arange(6 * 2 * 4, dtype='int64').reshape( + -1, 4), index=index, columns=list('ABCD')) + + # multi-axis slicing + idx = pd.IndexSlice + expected = df.iloc[[0, 2, 4], [0, 1]] + result = df.loc[(slice(Timestamp('2012-01-01 12:12:12'), + Timestamp('2012-01-03 12:12:12')), + slice(1, 1)), slice('A', 'B')] + tm.assert_frame_equal(result, expected) + + result = df.loc[(idx[Timestamp('2012-01-01 12:12:12'):Timestamp( + '2012-01-03 12:12:12')], idx[1:1]), slice('A', 'B')] + tm.assert_frame_equal(result, expected) + + result = df.loc[(slice(Timestamp('2012-01-01 12:12:12'), + Timestamp('2012-01-03 
12:12:12')), 1), + slice('A', 'B')] + tm.assert_frame_equal(result, expected) + + # with strings + result = df.loc[(slice('2012-01-01 12:12:12', '2012-01-03 12:12:12'), + slice(1, 1)), slice('A', 'B')] + tm.assert_frame_equal(result, expected) + + result = df.loc[(idx['2012-01-01 12:12:12':'2012-01-03 12:12:12'], 1), + idx['A', 'B']] + tm.assert_frame_equal(result, expected) + + def test_multiindex_slicers_edges(self): + # GH 8132 + # various edge cases + df = DataFrame( + {'A': ['A0'] * 5 + ['A1'] * 5 + ['A2'] * 5, + 'B': ['B0', 'B0', 'B1', 'B1', 'B2'] * 3, + 'DATE': ["2013-06-11", "2013-07-02", "2013-07-09", "2013-07-30", + "2013-08-06", "2013-06-11", "2013-07-02", "2013-07-09", + "2013-07-30", "2013-08-06", "2013-09-03", "2013-10-01", + "2013-07-09", "2013-08-06", "2013-09-03"], + 'VALUES': [22, 35, 14, 9, 4, 40, 18, 4, 2, 5, 1, 2, 3, 4, 2]}) + + df['DATE'] = pd.to_datetime(df['DATE']) + df1 = df.set_index(['A', 'B', 'DATE']) + df1 = df1.sort_index() + + # A1 - Get all values under "A0" and "A1" + result = df1.loc[(slice('A1')), :] + expected = df1.iloc[0:10] + tm.assert_frame_equal(result, expected) + + # A2 - Get all values from the start to "A2" + result = df1.loc[(slice('A2')), :] + expected = df1 + tm.assert_frame_equal(result, expected) + + # A3 - Get all values under "B1" or "B2" + result = df1.loc[(slice(None), slice('B1', 'B2')), :] + expected = df1.iloc[[2, 3, 4, 7, 8, 9, 12, 13, 14]] + tm.assert_frame_equal(result, expected) + + # A4 - Get all values between 2013-07-02 and 2013-07-09 + result = df1.loc[(slice(None), slice(None), + slice('20130702', '20130709')), :] + expected = df1.iloc[[1, 2, 6, 7, 12]] + tm.assert_frame_equal(result, expected) + + # B1 - Get all values in B0 that are also under A0, A1 and A2 + result = df1.loc[(slice('A2'), slice('B0')), :] + expected = df1.iloc[[0, 1, 5, 6, 10, 11]] + tm.assert_frame_equal(result, expected) + + # B2 - Get all values in B0, B1 and B2 (similar to what #2 is doing for + # the As) + result = df1.loc[(slice(None), slice('B2')), :] + expected = df1 + tm.assert_frame_equal(result, expected) + + # B3 - Get all values from B1 to B2 and up to 2013-08-06 + result = df1.loc[(slice(None), slice('B1', 'B2'), + slice('2013-08-06')), :] + expected = df1.iloc[[2, 3, 4, 7, 8, 9, 12, 13]] + tm.assert_frame_equal(result, expected) + + # B4 - Same as A4 but the start of the date slice is not a key. 
+ # shows indexing on a partial selection slice + result = df1.loc[(slice(None), slice(None), + slice('20130701', '20130709')), :] + expected = df1.iloc[[1, 2, 6, 7, 12]] + tm.assert_frame_equal(result, expected) + + def test_per_axis_per_level_doc_examples(self): + + # test index maker + idx = pd.IndexSlice + + # from indexing.rst / advanced + index = MultiIndex.from_product([_mklbl('A', 4), _mklbl('B', 2), + _mklbl('C', 4), _mklbl('D', 2)]) + columns = MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'), + ('b', 'foo'), ('b', 'bah')], + names=['lvl0', 'lvl1']) + df = DataFrame(np.arange(len(index) * len(columns), dtype='int64') + .reshape((len(index), len(columns))), + index=index, columns=columns) + result = df.loc[(slice('A1', 'A3'), slice(None), ['C1', 'C3']), :] + expected = df.loc[[tuple([a, b, c, d]) + for a, b, c, d in df.index.values + if (a == 'A1' or a == 'A2' or a == 'A3') and ( + c == 'C1' or c == 'C3')]] + tm.assert_frame_equal(result, expected) + result = df.loc[idx['A1':'A3', :, ['C1', 'C3']], :] + tm.assert_frame_equal(result, expected) + + result = df.loc[(slice(None), slice(None), ['C1', 'C3']), :] + expected = df.loc[[tuple([a, b, c, d]) + for a, b, c, d in df.index.values + if (c == 'C1' or c == 'C3')]] + tm.assert_frame_equal(result, expected) + result = df.loc[idx[:, :, ['C1', 'C3']], :] + tm.assert_frame_equal(result, expected) + + # not sorted + def f(): + df.loc['A1', ('a', slice('foo'))] + + pytest.raises(UnsortedIndexError, f) + + # GH 16734: not sorted, but no real slicing + tm.assert_frame_equal(df.loc['A1', (slice(None), 'foo')], + df.loc['A1'].iloc[:, [0, 2]]) + + df = df.sort_index(axis=1) + + # slicing + df.loc['A1', (slice(None), 'foo')] + df.loc[(slice(None), slice(None), ['C1', 'C3']), (slice(None), 'foo')] + + # setitem + df.loc(axis=0)[:, :, ['C1', 'C3']] = -10 + + def test_loc_axis_arguments(self): + + index = MultiIndex.from_product([_mklbl('A', 4), _mklbl('B', 2), + _mklbl('C', 4), _mklbl('D', 2)]) + columns = MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'), + ('b', 'foo'), ('b', 'bah')], + names=['lvl0', 'lvl1']) + df = DataFrame(np.arange(len(index) * len(columns), dtype='int64') + .reshape((len(index), len(columns))), + index=index, + columns=columns).sort_index().sort_index(axis=1) + + # axis 0 + result = df.loc(axis=0)['A1':'A3', :, ['C1', 'C3']] + expected = df.loc[[tuple([a, b, c, d]) + for a, b, c, d in df.index.values + if (a == 'A1' or a == 'A2' or a == 'A3') and ( + c == 'C1' or c == 'C3')]] + tm.assert_frame_equal(result, expected) + + result = df.loc(axis='index')[:, :, ['C1', 'C3']] + expected = df.loc[[tuple([a, b, c, d]) + for a, b, c, d in df.index.values + if (c == 'C1' or c == 'C3')]] + tm.assert_frame_equal(result, expected) + + # axis 1 + result = df.loc(axis=1)[:, 'foo'] + expected = df.loc[:, (slice(None), 'foo')] + tm.assert_frame_equal(result, expected) + + result = df.loc(axis='columns')[:, 'foo'] + expected = df.loc[:, (slice(None), 'foo')] + tm.assert_frame_equal(result, expected) + + # invalid axis + def f(): + df.loc(axis=-1)[:, :, ['C1', 'C3']] + + pytest.raises(ValueError, f) + + def f(): + df.loc(axis=2)[:, :, ['C1', 'C3']] + + pytest.raises(ValueError, f) + + def f(): + df.loc(axis='foo')[:, :, ['C1', 'C3']] + + pytest.raises(ValueError, f) + + def test_per_axis_per_level_setitem(self): + + # test index maker + idx = pd.IndexSlice + + # test multi-index slicing with per axis and per index controls + index = MultiIndex.from_tuples([('A', 1), ('A', 2), + ('A', 3), ('B', 1)], + names=['one', 'two']) + columns = 
MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'), + ('b', 'foo'), ('b', 'bah')], + names=['lvl0', 'lvl1']) + + df_orig = DataFrame( + np.arange(16, dtype='int64').reshape( + 4, 4), index=index, columns=columns) + df_orig = df_orig.sort_index(axis=0).sort_index(axis=1) + + # identity + df = df_orig.copy() + df.loc[(slice(None), slice(None)), :] = 100 + expected = df_orig.copy() + expected.iloc[:, :] = 100 + tm.assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc(axis=0)[:, :] = 100 + expected = df_orig.copy() + expected.iloc[:, :] = 100 + tm.assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc[(slice(None), slice(None)), (slice(None), slice(None))] = 100 + expected = df_orig.copy() + expected.iloc[:, :] = 100 + tm.assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc[:, (slice(None), slice(None))] = 100 + expected = df_orig.copy() + expected.iloc[:, :] = 100 + tm.assert_frame_equal(df, expected) + + # index + df = df_orig.copy() + df.loc[(slice(None), [1]), :] = 100 + expected = df_orig.copy() + expected.iloc[[0, 3]] = 100 + tm.assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc[(slice(None), 1), :] = 100 + expected = df_orig.copy() + expected.iloc[[0, 3]] = 100 + tm.assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc(axis=0)[:, 1] = 100 + expected = df_orig.copy() + expected.iloc[[0, 3]] = 100 + tm.assert_frame_equal(df, expected) + + # columns + df = df_orig.copy() + df.loc[:, (slice(None), ['foo'])] = 100 + expected = df_orig.copy() + expected.iloc[:, [1, 3]] = 100 + tm.assert_frame_equal(df, expected) + + # both + df = df_orig.copy() + df.loc[(slice(None), 1), (slice(None), ['foo'])] = 100 + expected = df_orig.copy() + expected.iloc[[0, 3], [1, 3]] = 100 + tm.assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc[idx[:, 1], idx[:, ['foo']]] = 100 + expected = df_orig.copy() + expected.iloc[[0, 3], [1, 3]] = 100 + tm.assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc['A', 'a'] = 100 + expected = df_orig.copy() + expected.iloc[0:3, 0:2] = 100 + tm.assert_frame_equal(df, expected) + + # setting with a list-like + df = df_orig.copy() + df.loc[(slice(None), 1), (slice(None), ['foo'])] = np.array( + [[100, 100], [100, 100]], dtype='int64') + expected = df_orig.copy() + expected.iloc[[0, 3], [1, 3]] = 100 + tm.assert_frame_equal(df, expected) + + # not enough values + df = df_orig.copy() + + def f(): + df.loc[(slice(None), 1), (slice(None), ['foo'])] = np.array( + [[100], [100, 100]], dtype='int64') + + pytest.raises(ValueError, f) + + def f(): + df.loc[(slice(None), 1), (slice(None), ['foo'])] = np.array( + [100, 100, 100, 100], dtype='int64') + + pytest.raises(ValueError, f) + + # with an alignable rhs + df = df_orig.copy() + df.loc[(slice(None), 1), (slice(None), ['foo'])] = df.loc[(slice( + None), 1), (slice(None), ['foo'])] * 5 + expected = df_orig.copy() + expected.iloc[[0, 3], [1, 3]] = expected.iloc[[0, 3], [1, 3]] * 5 + tm.assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc[(slice(None), 1), (slice(None), ['foo'])] *= df.loc[(slice( + None), 1), (slice(None), ['foo'])] + expected = df_orig.copy() + expected.iloc[[0, 3], [1, 3]] *= expected.iloc[[0, 3], [1, 3]] + tm.assert_frame_equal(df, expected) + + rhs = df_orig.loc[(slice(None), 1), (slice(None), ['foo'])].copy() + rhs.loc[:, ('c', 'bah')] = 10 + df = df_orig.copy() + df.loc[(slice(None), 1), (slice(None), ['foo'])] *= rhs + expected = df_orig.copy() + expected.iloc[[0, 3], [1, 3]] *= expected.iloc[[0, 3], [1, 3]] + 
tm.assert_frame_equal(df, expected) + + +@pytest.mark.filterwarnings('ignore:\\nPanel:FutureWarning') +class TestMultiIndexPanel(object): + + def test_iloc_getitem_panel_multiindex(self): + + # GH 7199 + # Panel with multi-index + multi_index = MultiIndex.from_tuples([('ONE', 'one'), + ('TWO', 'two'), + ('THREE', 'three')], + names=['UPPER', 'lower']) + + simple_index = [x[0] for x in multi_index] + wd1 = Panel(items=['First', 'Second'], + major_axis=['a', 'b', 'c', 'd'], + minor_axis=multi_index) + + wd2 = Panel(items=['First', 'Second'], + major_axis=['a', 'b', 'c', 'd'], + minor_axis=simple_index) + + expected1 = wd1['First'].iloc[[True, True, True, False], [0, 2]] + result1 = wd1.iloc[0, [True, True, True, False], [0, 2]] # WRONG + tm.assert_frame_equal(result1, expected1) + + expected2 = wd2['First'].iloc[[True, True, True, False], [0, 2]] + result2 = wd2.iloc[0, [True, True, True, False], [0, 2]] + tm.assert_frame_equal(result2, expected2) + + expected1 = DataFrame(index=['a'], columns=multi_index, + dtype='float64') + result1 = wd1.iloc[0, [0], [0, 1, 2]] + tm.assert_frame_equal(result1, expected1) + + expected2 = DataFrame(index=['a'], columns=simple_index, + dtype='float64') + result2 = wd2.iloc[0, [0], [0, 1, 2]] + tm.assert_frame_equal(result2, expected2) + + # GH 7516 + mi = MultiIndex.from_tuples([(0, 'x'), (1, 'y'), (2, 'z')]) + p = Panel(np.arange(3 * 3 * 3, dtype='int64').reshape(3, 3, 3), + items=['a', 'b', 'c'], major_axis=mi, + minor_axis=['u', 'v', 'w']) + result = p.iloc[:, 1, 0] + expected = Series([3, 12, 21], index=['a', 'b', 'c'], name='u') + tm.assert_series_equal(result, expected) + + result = p.loc[:, (1, 'y'), 'u'] + tm.assert_series_equal(result, expected) + + def test_panel_setitem_with_multiindex(self): + + # 10360 + # failing with a multi-index + arr = np.array([[[1, 2, 3], [0, 0, 0]], + [[0, 0, 0], [0, 0, 0]]], + dtype=np.float64) + + # reg index + axes = dict(items=['A', 'B'], major_axis=[0, 1], + minor_axis=['X', 'Y', 'Z']) + p1 = Panel(0., **axes) + p1.iloc[0, 0, :] = [1, 2, 3] + expected = Panel(arr, **axes) + tm.assert_panel_equal(p1, expected) + + # multi-indexes + axes['items'] = MultiIndex.from_tuples( + [('A', 'a'), ('B', 'b')]) + p2 = Panel(0., **axes) + p2.iloc[0, 0, :] = [1, 2, 3] + expected = Panel(arr, **axes) + tm.assert_panel_equal(p2, expected) + + axes['major_axis'] = MultiIndex.from_tuples( + [('A', 1), ('A', 2)]) + p3 = Panel(0., **axes) + p3.iloc[0, 0, :] = [1, 2, 3] + expected = Panel(arr, **axes) + tm.assert_panel_equal(p3, expected) + + axes['minor_axis'] = MultiIndex.from_product( + [['X'], range(3)]) + p4 = Panel(0., **axes) + p4.iloc[0, 0, :] = [1, 2, 3] + expected = Panel(arr, **axes) + tm.assert_panel_equal(p4, expected) + + arr = np.array( + [[[1, 0, 0], [2, 0, 0]], [[0, 0, 0], [0, 0, 0]]], + dtype=np.float64) + p5 = Panel(0., **axes) + p5.iloc[0, :, 0] = [1, 2] + expected = Panel(arr, **axes) + tm.assert_panel_equal(p5, expected) + + +def test_multiindex_period_datetime(): + # GH4861, using datetime in period of multiindex raises exception + + idx1 = Index(['a', 'a', 'a', 'b', 'b']) + idx2 = period_range('2012-01', periods=len(idx1), freq='M') + s = Series(np.random.randn(len(idx1)), [idx1, idx2]) + + # try Period as index + expected = s.iloc[0] + result = s.loc['a', Period('2012-01')] + assert result == expected + + # try datetime as index + result = s.loc['a', datetime(2012, 1, 1)] + assert result == expected diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 
97790920d46f7..14ef6237e8ddd 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -711,8 +711,8 @@ def test_multiindex_xs(self): index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', 'three']], - labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=['first', 'second']) mgr.set_axis(1, index) diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index cba3f000b59c1..69fdb7329a165 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -327,11 +327,11 @@ def test_to_csv_multi_index(self): @pytest.mark.parametrize("ind,expected", [ (pd.MultiIndex(levels=[[1.0]], - labels=[[0]], + codes=[[0]], names=["x"]), "x,data\n1.0,1\n"), (pd.MultiIndex(levels=[[1.], [2.]], - labels=[[0], [0]], + codes=[[0], [0]], names=["x", "y"]), "x,y,data\n1.0,2.0,1\n") ]) diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py index ce9aca3a87c51..627689b865148 100644 --- a/pandas/tests/io/formats/test_to_html.py +++ b/pandas/tests/io/formats/test_to_html.py @@ -403,10 +403,10 @@ def test_to_html_no_index_max_rows(self, datapath): def test_to_html_multiindex_max_cols(self, datapath): # GH 6131 index = MultiIndex(levels=[['ba', 'bb', 'bc'], ['ca', 'cb', 'cc']], - labels=[[0, 1, 2], [0, 1, 2]], + codes=[[0, 1, 2], [0, 1, 2]], names=['b', 'c']) columns = MultiIndex(levels=[['d'], ['aa', 'ab', 'ac']], - labels=[[0, 0, 0], [0, 1, 2]], + codes=[[0, 0, 0], [0, 1, 2]], names=[None, 'a']) data = np.array( [[1., np.nan, np.nan], [np.nan, 2., np.nan], [np.nan, np.nan, 3.]]) diff --git a/pandas/tests/io/parser/header.py b/pandas/tests/io/parser/header.py new file mode 100644 index 0000000000000..1801e48417591 --- /dev/null +++ b/pandas/tests/io/parser/header.py @@ -0,0 +1,407 @@ +# -*- coding: utf-8 -*- + +""" +Tests that the file header is properly handled or inferred +during parsing for all of the parsers defined in parsers.py +""" + +from collections import namedtuple + +import numpy as np +import pytest + +from pandas.compat import StringIO, lrange, u +from pandas.errors import ParserError + +from pandas import DataFrame, Index, MultiIndex +import pandas.util.testing as tm + + +class HeaderTests(object): + + def test_read_with_bad_header(self): + errmsg = r"but only \d+ lines in file" + + with pytest.raises(ValueError, match=errmsg): + s = StringIO(',,') + self.read_csv(s, header=[10]) + + def test_bool_header_arg(self): + # see gh-6114 + data = """\ +MyColumn + a + b + a + b""" + for arg in [True, False]: + with pytest.raises(TypeError): + self.read_csv(StringIO(data), header=arg) + with pytest.raises(TypeError): + self.read_table(StringIO(data), header=arg) + + def test_no_header_prefix(self): + data = """1,2,3,4,5 +6,7,8,9,10 +11,12,13,14,15 +""" + df_pref = self.read_table(StringIO(data), sep=',', prefix='Field', + header=None) + + expected = np.array([[1, 2, 3, 4, 5], + [6, 7, 8, 9, 10], + [11, 12, 13, 14, 15]], dtype=np.int64) + tm.assert_almost_equal(df_pref.values, expected) + + tm.assert_index_equal(df_pref.columns, + Index(['Field0', 'Field1', 'Field2', + 'Field3', 'Field4'])) + + def test_header_with_index_col(self): + data = """foo,1,2,3 +bar,4,5,6 +baz,7,8,9 +""" + names = ['A', 'B', 'C'] + df = self.read_csv(StringIO(data), names=names) + + assert list(df.columns) == ['A', 'B', 'C'] + + values = [[1, 2, 3], [4, 5, 6], [7, 
8, 9]] + expected = DataFrame(values, index=['foo', 'bar', 'baz'], + columns=['A', 'B', 'C']) + tm.assert_frame_equal(df, expected) + + def test_header_not_first_line(self): + data = """got,to,ignore,this,line +got,to,ignore,this,line +index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +""" + data2 = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +""" + + df = self.read_csv(StringIO(data), header=2, index_col=0) + expected = self.read_csv(StringIO(data2), header=0, index_col=0) + tm.assert_frame_equal(df, expected) + + def test_header_multi_index(self): + expected = tm.makeCustomDataframe( + 5, 3, r_idx_nlevels=2, c_idx_nlevels=4) + + data = """\ +C0,,C_l0_g0,C_l0_g1,C_l0_g2 + +C1,,C_l1_g0,C_l1_g1,C_l1_g2 +C2,,C_l2_g0,C_l2_g1,C_l2_g2 +C3,,C_l3_g0,C_l3_g1,C_l3_g2 +R0,R1,,, +R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2 +R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2 +R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2 +R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2 +R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2 +""" + + df = self.read_csv(StringIO(data), header=[0, 1, 2, 3], + index_col=[0, 1]) + tm.assert_frame_equal(df, expected) + + # skipping lines in the header + df = self.read_csv(StringIO(data), header=[0, 1, 2, 3], + index_col=[0, 1]) + tm.assert_frame_equal(df, expected) + + # INVALID OPTIONS + + # names + pytest.raises(ValueError, self.read_csv, + StringIO(data), header=[0, 1, 2, 3], + index_col=[0, 1], names=['foo', 'bar']) + + # usecols + pytest.raises(ValueError, self.read_csv, + StringIO(data), header=[0, 1, 2, 3], + index_col=[0, 1], usecols=['foo', 'bar']) + + # non-numeric index_col + pytest.raises(ValueError, self.read_csv, + StringIO(data), header=[0, 1, 2, 3], + index_col=['foo', 'bar']) + + def test_header_multiindex_common_format(self): + + df = DataFrame([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], + index=['one', 'two'], + columns=MultiIndex.from_tuples( + [('a', 'q'), ('a', 'r'), ('a', 's'), + ('b', 't'), ('c', 'u'), ('c', 'v')])) + + # to_csv + data = """,a,a,a,b,c,c +,q,r,s,t,u,v +,,,,,, +one,1,2,3,4,5,6 +two,7,8,9,10,11,12""" + + result = self.read_csv(StringIO(data), header=[0, 1], index_col=0) + tm.assert_frame_equal(df, result) + + # to_csv, tuples + result = self.read_csv(StringIO(data), skiprows=3, + names=[('a', 'q'), ('a', 'r'), ('a', 's'), + ('b', 't'), ('c', 'u'), ('c', 'v')], + index_col=0) + tm.assert_frame_equal(df, result) + + # to_csv, namedtuples + TestTuple = namedtuple('names', ['first', 'second']) + result = self.read_csv( + StringIO(data), skiprows=3, index_col=0, + names=[TestTuple('a', 'q'), TestTuple('a', 'r'), + TestTuple('a', 's'), TestTuple('b', 't'), + TestTuple('c', 'u'), TestTuple('c', 'v')]) + tm.assert_frame_equal(df, result) + + # common + data = """,a,a,a,b,c,c +,q,r,s,t,u,v +one,1,2,3,4,5,6 +two,7,8,9,10,11,12""" + + result = self.read_csv(StringIO(data), header=[0, 1], index_col=0) + tm.assert_frame_equal(df, result) + + # common, tuples + result = self.read_csv(StringIO(data), skiprows=2, + names=[('a', 'q'), ('a', 'r'), ('a', 's'), + ('b', 't'), ('c', 'u'), ('c', 'v')], + index_col=0) + tm.assert_frame_equal(df, result) + + # common, namedtuples + TestTuple = namedtuple('names', ['first', 'second']) + result = self.read_csv( + StringIO(data), skiprows=2, index_col=0, + names=[TestTuple('a', 'q'), TestTuple('a', 'r'), + TestTuple('a', 's'), TestTuple('b', 't'), + TestTuple('c', 'u'), TestTuple('c', 'v')]) + tm.assert_frame_equal(df, result) + + # common, no index_col + data = """a,a,a,b,c,c +q,r,s,t,u,v +1,2,3,4,5,6 +7,8,9,10,11,12""" + + result = self.read_csv(StringIO(data), header=[0, 
1], index_col=None) + tm.assert_frame_equal(df.reset_index(drop=True), result) + + # common, no index_col, tuples + result = self.read_csv(StringIO(data), skiprows=2, + names=[('a', 'q'), ('a', 'r'), ('a', 's'), + ('b', 't'), ('c', 'u'), ('c', 'v')], + index_col=None) + tm.assert_frame_equal(df.reset_index(drop=True), result) + + # common, no index_col, namedtuples + TestTuple = namedtuple('names', ['first', 'second']) + result = self.read_csv( + StringIO(data), skiprows=2, index_col=None, + names=[TestTuple('a', 'q'), TestTuple('a', 'r'), + TestTuple('a', 's'), TestTuple('b', 't'), + TestTuple('c', 'u'), TestTuple('c', 'v')]) + tm.assert_frame_equal(df.reset_index(drop=True), result) + + # malformed case 1 + expected = DataFrame(np.array( + [[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype='int64'), + index=Index([1, 7]), + columns=MultiIndex(levels=[[u('a'), u('b'), u('c')], + [u('r'), u('s'), u('t'), + u('u'), u('v')]], + codes=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]], + names=[u('a'), u('q')])) + + data = """a,a,a,b,c,c +q,r,s,t,u,v +1,2,3,4,5,6 +7,8,9,10,11,12""" + + result = self.read_csv(StringIO(data), header=[0, 1], index_col=0) + tm.assert_frame_equal(expected, result) + + # malformed case 2 + expected = DataFrame(np.array( + [[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype='int64'), + index=Index([1, 7]), + columns=MultiIndex(levels=[[u('a'), u('b'), u('c')], + [u('r'), u('s'), u('t'), + u('u'), u('v')]], + codes=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]], + names=[None, u('q')])) + + data = """,a,a,b,c,c +q,r,s,t,u,v +1,2,3,4,5,6 +7,8,9,10,11,12""" + + result = self.read_csv(StringIO(data), header=[0, 1], index_col=0) + tm.assert_frame_equal(expected, result) + + # mi on columns and index (malformed) + expected = DataFrame(np.array( + [[3, 4, 5, 6], [9, 10, 11, 12]], dtype='int64'), + index=MultiIndex(levels=[[1, 7], [2, 8]], + codes=[[0, 1], [0, 1]]), + columns=MultiIndex(levels=[[u('a'), u('b'), u('c')], + [u('s'), u('t'), u('u'), u('v')]], + codes=[[0, 1, 2, 2], [0, 1, 2, 3]], + names=[None, u('q')])) + + data = """,a,a,b,c,c +q,r,s,t,u,v +1,2,3,4,5,6 +7,8,9,10,11,12""" + + result = self.read_csv(StringIO(data), header=[0, 1], index_col=[0, 1]) + tm.assert_frame_equal(expected, result) + + def test_header_names_backward_compat(self): + # #2539 + data = '1,2,3\n4,5,6' + + result = self.read_csv(StringIO(data), names=['a', 'b', 'c']) + expected = self.read_csv(StringIO(data), names=['a', 'b', 'c'], + header=None) + tm.assert_frame_equal(result, expected) + + data2 = 'foo,bar,baz\n' + data + result = self.read_csv(StringIO(data2), names=['a', 'b', 'c'], + header=0) + tm.assert_frame_equal(result, expected) + + def test_read_only_header_no_rows(self): + # See gh-7773 + expected = DataFrame(columns=['a', 'b', 'c']) + + df = self.read_csv(StringIO('a,b,c')) + tm.assert_frame_equal(df, expected) + + df = self.read_csv(StringIO('a,b,c'), index_col=False) + tm.assert_frame_equal(df, expected) + + def test_no_header(self): + data = """1,2,3,4,5 +6,7,8,9,10 +11,12,13,14,15 +""" + df = self.read_table(StringIO(data), sep=',', header=None) + df_pref = self.read_table(StringIO(data), sep=',', prefix='X', + header=None) + + names = ['foo', 'bar', 'baz', 'quux', 'panda'] + df2 = self.read_table(StringIO(data), sep=',', names=names) + expected = np.array([[1, 2, 3, 4, 5], + [6, 7, 8, 9, 10], + [11, 12, 13, 14, 15]], dtype=np.int64) + tm.assert_almost_equal(df.values, expected) + tm.assert_almost_equal(df.values, df2.values) + + tm.assert_index_equal(df_pref.columns, + Index(['X0', 'X1', 'X2', 'X3', 'X4'])) + 
tm.assert_index_equal(df.columns, Index(lrange(5))) + + tm.assert_index_equal(df2.columns, Index(names)) + + def test_non_int_header(self): + # GH 16338 + msg = 'header must be integer or list of integers' + data = """1,2\n3,4""" + with pytest.raises(ValueError, match=msg): + self.read_csv(StringIO(data), sep=',', header=['a', 'b']) + with pytest.raises(ValueError, match=msg): + self.read_csv(StringIO(data), sep=',', header='string_header') + + def test_singleton_header(self): + # See GH #7757 + data = """a,b,c\n0,1,2\n1,2,3""" + df = self.read_csv(StringIO(data), header=[0]) + expected = DataFrame({"a": [0, 1], "b": [1, 2], "c": [2, 3]}) + tm.assert_frame_equal(df, expected) + + def test_mangles_multi_index(self): + # See GH 18062 + data = """A,A,A,B\none,one,one,two\n0,40,34,0.1""" + df = self.read_csv(StringIO(data), header=[0, 1]) + expected = DataFrame([[0, 40, 34, 0.1]], + columns=MultiIndex.from_tuples( + [('A', 'one'), ('A', 'one.1'), + ('A', 'one.2'), ('B', 'two')])) + tm.assert_frame_equal(df, expected) + + data = """A,A,A,B\none,one,one.1,two\n0,40,34,0.1""" + df = self.read_csv(StringIO(data), header=[0, 1]) + expected = DataFrame([[0, 40, 34, 0.1]], + columns=MultiIndex.from_tuples( + [('A', 'one'), ('A', 'one.1'), + ('A', 'one.1.1'), ('B', 'two')])) + tm.assert_frame_equal(df, expected) + + data = """A,A,A,B,B\none,one,one.1,two,two\n0,40,34,0.1,0.1""" + df = self.read_csv(StringIO(data), header=[0, 1]) + expected = DataFrame([[0, 40, 34, 0.1, 0.1]], + columns=MultiIndex.from_tuples( + [('A', 'one'), ('A', 'one.1'), + ('A', 'one.1.1'), ('B', 'two'), + ('B', 'two.1')])) + tm.assert_frame_equal(df, expected) + + @pytest.mark.parametrize("index_col", [None, [0]]) + @pytest.mark.parametrize("columns", [None, + (["", "Unnamed"]), + (["Unnamed", ""]), + (["Unnamed", "NotUnnamed"])]) + def test_multi_index_unnamed(self, index_col, columns): + # see gh-23687 + # + # When specifying a multi-index header, make sure that + # we don't error just because one of the rows in our header + # has ALL column names containing the string "Unnamed". The + # correct condition to check is whether the row contains + # ALL columns that did not have names (and instead were given + # placeholder ones). + header = [0, 1] + + if index_col is None: + data = ",".join(columns or ["", ""]) + "\n0,1\n2,3\n4,5\n" + else: + data = (",".join([""] + (columns or ["", ""])) + + "\n,0,1\n0,2,3\n1,4,5\n") + + if columns is None: + msg = (r"Passed header=\[0,1\] are too " + r"many rows for this multi_index of columns") + with pytest.raises(ParserError, match=msg): + self.read_csv(StringIO(data), header=header, + index_col=index_col) + else: + result = self.read_csv(StringIO(data), header=header, + index_col=index_col) + template = "Unnamed: {i}_level_0" + exp_columns = [] + + for i, col in enumerate(columns): + if not col: # Unnamed. 
+ col = template.format(i=i if index_col is None else i + 1) + + exp_columns.append(col) + + columns = MultiIndex.from_tuples(zip(exp_columns, ["0", "1"])) + expected = DataFrame([[2, 3], [4, 5]], columns=columns) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/index_col.py b/pandas/tests/io/parser/index_col.py new file mode 100644 index 0000000000000..3be610b2ade22 --- /dev/null +++ b/pandas/tests/io/parser/index_col.py @@ -0,0 +1,171 @@ +# -*- coding: utf-8 -*- + +""" +Tests that the specified index column (a.k.a 'index_col') +is properly handled or inferred during parsing for all of +the parsers defined in parsers.py +""" + +import pytest + +from pandas.compat import StringIO + +from pandas import DataFrame, Index, MultiIndex +import pandas.util.testing as tm + + +class IndexColTests(object): + + def test_index_col_named(self): + no_header = """\ +KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" # noqa + + h = "ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir\n" # noqa + data = h + no_header + rs = self.read_csv(StringIO(data), index_col='ID') + xp = self.read_csv(StringIO(data), header=0).set_index('ID') + tm.assert_frame_equal(rs, xp) + + pytest.raises(ValueError, self.read_csv, StringIO(no_header), + index_col='ID') + + data = """\ +1,2,3,4,hello +5,6,7,8,world +9,10,11,12,foo +""" + names = ['a', 'b', 'c', 'd', 'message'] + xp = DataFrame({'a': [1, 5, 9], 'b': [2, 6, 10], 'c': [3, 7, 11], + 'd': [4, 8, 12]}, + index=Index(['hello', 'world', 'foo'], name='message')) + rs = self.read_csv(StringIO(data), names=names, index_col=['message']) + tm.assert_frame_equal(xp, rs) + assert xp.index.name == rs.index.name + + rs = self.read_csv(StringIO(data), names=names, index_col='message') + tm.assert_frame_equal(xp, rs) + assert xp.index.name == rs.index.name + + def test_index_col_is_true(self): + # see gh-9798 + pytest.raises(ValueError, self.read_csv, + StringIO(self.ts_data), index_col=True) + + def test_infer_index_col(self): + data = """A,B,C +foo,1,2,3 +bar,4,5,6 +baz,7,8,9 +""" + data = self.read_csv(StringIO(data)) + assert data.index.equals(Index(['foo', 'bar', 'baz'])) + + def test_empty_index_col_scenarios(self): + data = 'x,y,z' + + # None, no index + index_col, expected = None, DataFrame([], columns=list('xyz')), + tm.assert_frame_equal(self.read_csv( + StringIO(data), index_col=index_col), expected) + + # False, no index + index_col, expected = False, DataFrame([], columns=list('xyz')), + tm.assert_frame_equal(self.read_csv( + StringIO(data), index_col=index_col), expected) + + # int, first column + index_col, expected = 0, DataFrame( + [], columns=['y', 'z'], index=Index([], name='x')) + tm.assert_frame_equal(self.read_csv( + StringIO(data), index_col=index_col), expected) + + # int, not first column + index_col, expected = 1, DataFrame( + [], columns=['x', 'z'], index=Index([], name='y')) + tm.assert_frame_equal(self.read_csv( + StringIO(data), index_col=index_col), expected) + + # str, first column + index_col, expected = 'x', DataFrame( + [], columns=['y', 'z'], index=Index([], name='x')) + 
tm.assert_frame_equal(self.read_csv( + StringIO(data), index_col=index_col), expected) + + # str, not the first column + index_col, expected = 'y', DataFrame( + [], columns=['x', 'z'], index=Index([], name='y')) + tm.assert_frame_equal(self.read_csv( + StringIO(data), index_col=index_col), expected) + + # list of int + index_col, expected = [0, 1], DataFrame( + [], columns=['z'], index=MultiIndex.from_arrays( + [[]] * 2, names=['x', 'y'])) + tm.assert_frame_equal(self.read_csv( + StringIO(data), index_col=index_col), + expected, check_index_type=False) + + # list of str + index_col = ['x', 'y'] + expected = DataFrame([], columns=['z'], + index=MultiIndex.from_arrays( + [[]] * 2, names=['x', 'y'])) + tm.assert_frame_equal(self.read_csv(StringIO( + data), index_col=index_col), + expected, check_index_type=False) + + # list of int, reversed sequence + index_col = [1, 0] + expected = DataFrame([], columns=['z'], index=MultiIndex.from_arrays( + [[]] * 2, names=['y', 'x'])) + tm.assert_frame_equal(self.read_csv( + StringIO(data), index_col=index_col), + expected, check_index_type=False) + + # list of str, reversed sequence + index_col = ['y', 'x'] + expected = DataFrame([], columns=['z'], index=MultiIndex.from_arrays( + [[]] * 2, names=['y', 'x'])) + tm.assert_frame_equal(self.read_csv(StringIO( + data), index_col=index_col), + expected, check_index_type=False) + + def test_empty_with_index_col_false(self): + # see gh-10413 + data = 'x,y' + result = self.read_csv(StringIO(data), index_col=False) + expected = DataFrame([], columns=['x', 'y']) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("index_names", [ + ["", ""], + ["foo", ""], + ["", "bar"], + ["foo", "bar"], + ["NotReallyUnnamed", "Unnamed: 0"], + ]) + def test_multi_index_naming(self, index_names): + # We don't want empty index names being replaced with "Unnamed: 0" + data = ",".join(index_names + ["col\na,c,1\na,d,2\nb,c,3\nb,d,4"]) + result = self.read_csv(StringIO(data), index_col=[0, 1]) + + expected = DataFrame({"col": [1, 2, 3, 4]}, + index=MultiIndex.from_product([["a", "b"], + ["c", "d"]])) + expected.index.names = [name if name else None for name in index_names] + tm.assert_frame_equal(result, expected) + + def test_multi_index_naming_not_all_at_beginning(self): + data = ",Unnamed: 2,\na,c,1\na,d,2\nb,c,3\nb,d,4" + result = self.read_csv(StringIO(data), index_col=[0, 2]) + + expected = DataFrame({"Unnamed: 2": ["c", "d", "c", "d"]}, + index=MultiIndex( + levels=[['a', 'b'], [1, 2, 3, 4]], + codes=[[0, 0, 1, 1], [0, 1, 2, 3]])) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index 8cc3dee6648a8..033d600ffc09b 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -260,7 +260,7 @@ def test_index_col_empty(self, ext): index_col=["A", "B", "C"]) expected = DataFrame(columns=["D", "E", "F"], index=MultiIndex(levels=[[]] * 3, - labels=[[]] * 3, + codes=[[]] * 3, names=["A", "B", "C"])) tm.assert_frame_equal(result, expected) @@ -1014,7 +1014,7 @@ def test_excel_old_index_format(self, ext): "R_l0_g2", "R_l0_g3", "R_l0_g4"], ["R1", "R_l1_g0", "R_l1_g1", "R_l1_g2", "R_l1_g3", "R_l1_g4"]], - labels=[[0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5]], + codes=[[0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5]], names=[None, None]) si = Index(["R0", "R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"], name=None) @@ -1041,7 +1041,7 @@ def test_excel_old_index_format(self, ext): "R_l0_g3", "R_l0_g4"], ["R_l1_g0", "R_l1_g1", "R_l1_g2", "R_l1_g3", 
"R_l1_g4"]], - labels=[[0, 1, 2, 3, 4], [0, 1, 2, 3, 4]], + codes=[[0, 1, 2, 3, 4], [0, 1, 2, 3, 4]], names=[None, None]) si = Index(["R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"], name=None) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 4201f751959b5..492089644fb15 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -798,7 +798,7 @@ def test_header_inferred_from_rows_with_only_th(self): """)[0] columns = MultiIndex(levels=[['A', 'B'], ['a', 'b']], - labels=[[0, 1], [0, 1]]) + codes=[[0, 1], [0, 1]]) expected = DataFrame(data=[[1, 2]], columns=columns) tm.assert_frame_equal(result, expected) @@ -995,7 +995,7 @@ def test_ignore_empty_rows_when_inferring_header(self): """)[0] columns = MultiIndex(levels=[['A', 'B'], ['a', 'b']], - labels=[[0, 1], [0, 1]]) + codes=[[0, 1], [0, 1]]) expected = DataFrame(data=[[1, 2]], columns=columns) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 99386e594ff3a..083ce16ef9296 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -401,8 +401,8 @@ def test_join_inner_multiindex(self): index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', 'three']], - labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=['first', 'second']) to_join = DataFrame(np.random.randn(10, 3), index=index, columns=['j_one', 'j_two', 'j_three']) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 4113fb7f0f11e..488d800af2a39 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -1188,8 +1188,8 @@ def test_concat_ignore_index(self, sort): def test_concat_multiindex_with_keys(self): index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', 'three']], - labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=['first', 'second']) frame = DataFrame(np.random.randn(10, 3), index=index, columns=Index(['A', 'B', 'C'], name='exp')) @@ -1258,8 +1258,8 @@ def test_concat_keys_and_levels(self): names=names) expected = concat([df, df2, df, df2]) exp_index = MultiIndex(levels=levels + [[0]], - labels=[[0, 0, 1, 1], [0, 1, 0, 1], - [0, 0, 0, 0]], + codes=[[0, 0, 1, 1], [0, 1, 0, 1], + [0, 0, 0, 0]], names=names + [None]) expected.index = exp_index @@ -1591,10 +1591,10 @@ def test_concat_series(self): ts.index = DatetimeIndex(np.array(ts.index.values, dtype='M8[ns]')) - exp_labels = [np.repeat([0, 1, 2], [len(x) for x in pieces]), - np.arange(len(ts))] + exp_codes = [np.repeat([0, 1, 2], [len(x) for x in pieces]), + np.arange(len(ts))] exp_index = MultiIndex(levels=[[0, 1, 2], ts.index], - labels=exp_labels) + codes=exp_codes) expected.index = exp_index tm.assert_series_equal(result, expected) @@ -2141,8 +2141,8 @@ def test_concat_multiindex_rangeindex(self): df = DataFrame(np.random.randn(9, 2)) df.index = MultiIndex(levels=[pd.RangeIndex(3), pd.RangeIndex(3)], - labels=[np.repeat(np.arange(3), 3), - np.tile(np.arange(3), 3)]) + codes=[np.repeat(np.arange(3), 3), + np.tile(np.arange(3), 3)]) res = concat([df.iloc[[2, 3, 4], :], df.iloc[[5], :]]) exp = df.iloc[[2, 3, 4, 5], :] @@ -2161,7 +2161,7 @@ def test_concat_multiindex_dfs_with_deepcopy(self): expected_index = 
pd.MultiIndex(levels=[['s1', 's2'], ['a'], ['b', 'c']], - labels=[[0, 1], [0, 0], [0, 1]], + codes=[[0, 1], [0, 0], [0, 1]], names=['testname', None, None]) expected = pd.DataFrame([[0], [1]], index=expected_index) result_copy = pd.concat(deepcopy(example_dict), names=['testname']) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index b3dd94b49e3a3..e32e1999836ec 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -451,7 +451,7 @@ def test_pivot_with_list_like_values(self, values, method): [4, 5, 6, 'q', 'w', 't']] index = Index(data=['one', 'two'], name='foo') columns = MultiIndex(levels=[['baz', 'zoo'], ['A', 'B', 'C']], - labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]], + codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]], names=[None, 'bar']) expected = DataFrame(data=data, index=index, columns=columns, dtype='object') @@ -482,7 +482,7 @@ def test_pivot_with_list_like_values_nans(self, values, method): ['C', np.nan, 3, np.nan]] index = Index(data=['q', 't', 'w', 'x', 'y', 'z'], name='zoo') columns = MultiIndex(levels=[['bar', 'baz'], ['one', 'two']], - labels=[[0, 0, 1, 1], [0, 1, 0, 1]], + codes=[[0, 0, 1, 1], [0, 1, 0, 1]], names=[None, 'foo']) expected = DataFrame(data=data, index=index, columns=columns, dtype='object') @@ -501,7 +501,7 @@ def test_pivot_with_multiindex(self, method): ['two', 'B', 5, 'w'], ['two', 'C', 6, 't']] columns = MultiIndex(levels=[['bar', 'baz'], ['first', 'second']], - labels=[[0, 0, 1, 1], [0, 1, 0, 1]]) + codes=[[0, 0, 1, 1], [0, 1, 0, 1]]) df = DataFrame(data=data, index=index, columns=columns, dtype='object') if method: result = df.pivot(index=('bar', 'first'), @@ -1238,7 +1238,7 @@ def test_pivot_string_as_func(self): result = pivot_table(data, index='A', columns='B', aggfunc='sum') mi = MultiIndex(levels=[['C'], ['one', 'two']], - labels=[[0, 0], [0, 1]], names=[None, 'B']) + codes=[[0, 0], [0, 1]], names=[None, 'B']) expected = DataFrame({('C', 'one'): {'bar': 15, 'foo': 13}, ('C', 'two'): {'bar': 7, 'foo': 20}}, columns=mi).rename_axis('A') @@ -1247,7 +1247,7 @@ def test_pivot_string_as_func(self): result = pivot_table(data, index='A', columns='B', aggfunc=['sum', 'mean']) mi = MultiIndex(levels=[['sum', 'mean'], ['C'], ['one', 'two']], - labels=[[0, 0, 1, 1], [0, 0, 0, 0], [0, 1, 0, 1]], + codes=[[0, 0, 1, 1], [0, 0, 0, 0], [0, 1, 0, 1]], names=[None, None, 'B']) expected = DataFrame({('mean', 'C', 'one'): {'bar': 5.0, 'foo': 3.25}, ('mean', 'C', 'two'): {'bar': 7.0, @@ -1724,8 +1724,8 @@ def test_crosstab_with_numpy_size(self): values=df['D']) expected_index = pd.MultiIndex(levels=[['All', 'one', 'three', 'two'], ['', 'A', 'B', 'C']], - labels=[[1, 1, 1, 2, 2, 2, 3, 3, 3, 0], - [1, 2, 3, 1, 2, 3, 1, 2, 3, 0]], + codes=[[1, 1, 1, 2, 2, 2, 3, 3, 3, 0], + [1, 2, 3, 1, 2, 3, 1, 2, 3, 0]], names=['A', 'B']) expected_column = pd.Index(['bar', 'foo', 'All'], dtype='object', diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index d8b3d9588f2f1..0d26e9c375d0d 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -613,7 +613,7 @@ def test_preserve_categorical_dtype(self): for ordered in [False, True]: cidx = pd.CategoricalIndex(list("xyz"), ordered=ordered) midx = pd.MultiIndex(levels=[['a'], cidx], - labels=[[0, 0], [0, 1]]) + codes=[[0, 0], [0, 1]]) df = DataFrame([[10, 11]], index=midx) expected = DataFrame([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]], diff --git a/pandas/tests/series/indexing/test_indexing.py 
b/pandas/tests/series/indexing/test_indexing.py index f969619d5acb0..92c41f65eb831 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -711,8 +711,8 @@ def test_type_promote_putmask(): def test_multilevel_preserve_name(): index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', 'three']], - labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=['first', 'second']) s = Series(np.random.randn(len(index)), index=index, name='sth') diff --git a/pandas/tests/series/test_alter_axes.py b/pandas/tests/series/test_alter_axes.py index 79de3dc3be19f..99a4f0c424ce9 100644 --- a/pandas/tests/series/test_alter_axes.py +++ b/pandas/tests/series/test_alter_axes.py @@ -133,8 +133,8 @@ def test_reset_index(self): # level index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]], - labels=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], - [0, 1, 0, 1, 0, 1]]) + codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], + [0, 1, 0, 1, 0, 1]]) s = Series(np.random.randn(6), index=index) rs = s.reset_index(level=1) assert len(rs.columns) == 2 @@ -204,8 +204,8 @@ def test_reset_index_range(self): def test_reorder_levels(self): index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]], - labels=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], - [0, 1, 0, 1, 0, 1]], + codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], + [0, 1, 0, 1, 0, 1]], names=['L0', 'L1', 'L2']) s = Series(np.arange(6), index=index) @@ -220,8 +220,8 @@ def test_reorder_levels(self): # rotate, position result = s.reorder_levels([1, 2, 0]) e_idx = MultiIndex(levels=[['one', 'two', 'three'], [0, 1], ['bar']], - labels=[[0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1], - [0, 0, 0, 0, 0, 0]], + codes=[[0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1], + [0, 0, 0, 0, 0, 0]], names=['L1', 'L2', 'L0']) expected = Series(np.arange(6), index=e_idx) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 6e40324c67b59..a9c8e855cd324 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -296,8 +296,8 @@ def test_kurt(self, string_series): self._check_stat_op('kurt', alt, string_series) index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]], - labels=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], - [0, 1, 0, 1, 0, 1]]) + codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], + [0, 1, 0, 1, 0, 1]]) s = Series(np.random.randn(6), index=index) tm.assert_almost_equal(s.kurt(), s.kurt(level=0)['bar']) @@ -1481,7 +1481,7 @@ def test_unstack(self): from numpy import nan index = MultiIndex(levels=[['bar', 'foo'], ['one', 'three', 'two']], - labels=[[1, 1, 0, 0], [0, 1, 0, 2]]) + codes=[[1, 1, 0, 0], [0, 1, 0, 2]]) s = Series(np.arange(4.), index=index) unstacked = s.unstack() @@ -1496,11 +1496,11 @@ def test_unstack(self): assert_frame_equal(unstacked, expected.T) index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]], - labels=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], - [0, 1, 0, 1, 0, 1]]) + codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], + [0, 1, 0, 1, 0, 1]]) s = Series(np.random.randn(6), index=index) exp_index = MultiIndex(levels=[['one', 'two', 'three'], [0, 1]], - labels=[[0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]]) + codes=[[0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]]) expected = DataFrame({'bar': s.values}, index=exp_index).sort_index(level=0) unstacked = s.unstack(0).sort_index() 
diff --git a/pandas/tests/series/test_repr.py b/pandas/tests/series/test_repr.py index c4a0496f7fb27..86de8176a9a65 100644 --- a/pandas/tests/series/test_repr.py +++ b/pandas/tests/series/test_repr.py @@ -25,8 +25,8 @@ class TestSeriesRepr(TestData): def test_multilevel_name_print(self): index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', 'three']], - labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=['first', 'second']) s = Series(lrange(0, len(index)), index=index, name='sth') expected = ["first second", "foo one 0", diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py index 969c20601c7c8..b9cf845ea47d7 100644 --- a/pandas/tests/series/test_timeseries.py +++ b/pandas/tests/series/test_timeseries.py @@ -1018,8 +1018,8 @@ def test_get_level_values_box(self): dates = date_range('1/1/2000', periods=4) levels = [dates, [0, 1]] - labels = [[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]] + codes = [[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]] - index = MultiIndex(levels=levels, labels=labels) + index = MultiIndex(levels=levels, codes=codes) assert isinstance(index.get_level_values(0)[0], Timestamp) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index cc4ee7ca72343..6c1a2490ea76e 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -28,14 +28,14 @@ def setup_method(self, method): index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', 'three']], - labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=['first', 'second']) self.frame = DataFrame(np.random.randn(10, 3), index=index, columns=Index(['A', 'B', 'C'], name='exp')) self.single_level = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']], - labels=[[0, 1, 2, 3]], names=['first']) + codes=[[0, 1, 2, 3]], names=['first']) # create test series object arrays = [['bar', 'bar', 'baz', 'baz', 'qux', 'qux', 'foo', 'foo'], @@ -292,7 +292,7 @@ def _check_counts(frame, axis=0): def test_count_level_series(self): index = MultiIndex(levels=[['foo', 'bar', 'baz'], ['one', 'two', 'three', 'four']], - labels=[[0, 0, 0, 2, 2], [2, 0, 1, 1, 2]]) + codes=[[0, 0, 0, 2, 2], [2, 0, 1, 1, 2]]) s = Series(np.random.randn(len(index)), index=index) @@ -410,7 +410,7 @@ def check(left, right): columns=['1st', '2nd', '3rd']) mi = MultiIndex(levels=[['a', 'b'], ['1st', '2nd', '3rd']], - labels=[np.tile( + codes=[np.tile( np.arange(2).repeat(3), 2), np.tile( np.arange(3), 4)]) @@ -418,7 +418,7 @@ def check(left, right): check(left, right) df.columns = ['1st', '2nd', '1st'] - mi = MultiIndex(levels=[['a', 'b'], ['1st', '2nd']], labels=[np.tile( + mi = MultiIndex(levels=[['a', 'b'], ['1st', '2nd']], codes=[np.tile( np.arange(2).repeat(3), 2), np.tile( [0, 1, 0], 4)]) @@ -428,7 +428,7 @@ def check(left, right): tpls = ('a', 2), ('b', 1), ('a', 1), ('b', 2) df.index = MultiIndex.from_tuples(tpls) mi = MultiIndex(levels=[['a', 'b'], [1, 2], ['1st', '2nd']], - labels=[np.tile( + codes=[np.tile( np.arange(2).repeat(3), 2), np.repeat( [1, 0, 1], [3, 6, 3]), np.tile( [0, 1, 0], 4)]) @@ -708,9 +708,9 @@ def test_unstack_sparse_keyspace(self): def test_unstack_unobserved_keys(self): # related to #2278 refactoring levels = [[0, 1], [0, 1, 2, 3]] - labels = [[0, 0, 1, 1], [0, 2, 0, 2]] + codes = [[0, 0, 1, 1], [0, 2, 0, 2]] - 
index = MultiIndex(levels, labels) + index = MultiIndex(levels, codes) df = DataFrame(np.random.randn(4, 2), index=index) @@ -736,8 +736,8 @@ def manual_compare_stacked(df, df_stacked, lev0, lev1): for levels in levels_poss: columns = MultiIndex(levels=levels, - labels=[[0, 0, 1, 1], - [0, 1, 0, 1]]) + codes=[[0, 0, 1, 1], + [0, 1, 0, 1]]) df = DataFrame(columns=columns, data=[range(4)]) for stack_lev in range(2): df_stacked = df.stack(stack_lev) @@ -746,14 +746,14 @@ def manual_compare_stacked(df, df_stacked, lev0, lev1): # check multi-row case mi = MultiIndex(levels=[["A", "C", "B"], ["B", "A", "C"]], - labels=[np.repeat(range(3), 3), np.tile(range(3), 3)]) + codes=[np.repeat(range(3), 3), np.tile(range(3), 3)]) df = DataFrame(columns=mi, index=range(5), data=np.arange(5 * len(mi)).reshape(5, -1)) manual_compare_stacked(df, df.stack(0), 0, 1) def test_groupby_corner(self): midx = MultiIndex(levels=[['foo'], ['bar'], ['baz']], - labels=[[0], [0], [0]], + codes=[[0], [0], [0]], names=['one', 'two', 'three']) df = DataFrame([np.random.rand(4)], columns=['a', 'b', 'c', 'd'], index=midx) @@ -1040,11 +1040,11 @@ def test_unstack_preserve_types(self): assert unstacked['F', 1].dtype == np.float64 def test_unstack_group_index_overflow(self): - labels = np.tile(np.arange(500), 2) + codes = np.tile(np.arange(500), 2) level = np.arange(500) index = MultiIndex(levels=[level] * 8 + [[0, 1]], - labels=[labels] * 8 + [np.arange(2).repeat(500)]) + codes=[codes] * 8 + [np.arange(2).repeat(500)]) s = Series(np.arange(1000), index=index) result = s.unstack() @@ -1056,7 +1056,7 @@ def test_unstack_group_index_overflow(self): # put it at beginning index = MultiIndex(levels=[[0, 1]] + [level] * 8, - labels=[np.arange(2).repeat(500)] + [labels] * 8) + codes=[np.arange(2).repeat(500)] + [codes] * 8) s = Series(np.arange(1000), index=index) result = s.unstack(0) @@ -1064,8 +1064,8 @@ def test_unstack_group_index_overflow(self): # put it in middle index = MultiIndex(levels=[level] * 4 + [[0, 1]] + [level] * 4, - labels=([labels] * 4 + [np.arange(2).repeat(500)] + - [labels] * 4)) + codes=([codes] * 4 + [np.arange(2).repeat(500)] + + [codes] * 4)) s = Series(np.arange(1000), index=index) result = s.unstack(4) @@ -1111,7 +1111,7 @@ def test_to_html(self): def test_level_with_tuples(self): index = MultiIndex(levels=[[('foo', 'bar', 0), ('foo', 'baz', 0), ( 'foo', 'qux', 0)], [0, 1]], - labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]]) + codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]]) series = Series(np.random.randn(6), index=index) frame = DataFrame(np.random.randn(6, 4), index=index) @@ -1134,7 +1134,7 @@ def test_level_with_tuples(self): index = MultiIndex(levels=[[('foo', 'bar'), ('foo', 'baz'), ( 'foo', 'qux')], [0, 1]], - labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]]) + codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]]) series = Series(np.random.randn(6), index=index) frame = DataFrame(np.random.randn(6, 4), index=index) @@ -1306,8 +1306,8 @@ def test_drop_preserve_names(self): def test_unicode_repr_issues(self): levels = [Index([u('a/\u03c3'), u('b/\u03c3'), u('c/\u03c3')]), Index([0, 1])] - labels = [np.arange(3).repeat(2), np.tile(np.arange(2), 3)] - index = MultiIndex(levels=levels, labels=labels) + codes = [np.arange(3).repeat(2), np.tile(np.arange(2), 3)] + index = MultiIndex(levels=levels, codes=codes) repr(index.levels) @@ -1379,8 +1379,8 @@ def test_assign_index_sequences(self): def test_tuples_have_na(self): index = MultiIndex(levels=[[1, 0], [0, 1, 2, 3]], - labels=[[1, 1, 1, 1, -1, 0, 0, 0], [0, 1, 2, 
3, 0, - 1, 2, 3]]) + codes=[[1, 1, 1, 1, -1, 0, 0, 0], + [0, 1, 2, 3, 0, 1, 2, 3]]) assert isna(index[4][0]) assert isna(index.values[4][0]) @@ -1827,15 +1827,15 @@ def test_is_lexsorted(self): levels = [[0, 1], [0, 1, 2]] index = MultiIndex(levels=levels, - labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]]) + codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]]) assert index.is_lexsorted() index = MultiIndex(levels=levels, - labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]]) + codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]]) assert not index.is_lexsorted() index = MultiIndex(levels=levels, - labels=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]]) + codes=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]]) assert not index.is_lexsorted() assert index.lexsort_depth == 0 @@ -1865,7 +1865,7 @@ def test_sort_index_and_reconstruction(self): result = DataFrame( [[1, 1], [2, 2], [1, 1], [2, 2]], index=MultiIndex(levels=[[0.5, 0.8], ['a', 'b']], - labels=[[0, 0, 1, 1], [0, 1, 0, 1]])) + codes=[[0, 0, 1, 1], [0, 1, 0, 1]])) result = result.sort_index() assert result.index.is_lexsorted() @@ -1903,7 +1903,7 @@ def test_sort_index_and_reconstruction_doc_example(self): df = DataFrame({'value': [1, 2, 3, 4]}, index=MultiIndex( levels=[['a', 'b'], ['bb', 'aa']], - labels=[[0, 0, 1, 1], [0, 1, 0, 1]])) + codes=[[0, 0, 1, 1], [0, 1, 0, 1]])) assert df.index.is_lexsorted() assert not df.index.is_monotonic @@ -1911,7 +1911,7 @@ def test_sort_index_and_reconstruction_doc_example(self): expected = DataFrame({'value': [2, 1, 4, 3]}, index=MultiIndex( levels=[['a', 'b'], ['aa', 'bb']], - labels=[[0, 0, 1, 1], [0, 1, 0, 1]])) + codes=[[0, 0, 1, 1], [0, 1, 0, 1]])) result = df.sort_index() assert result.index.is_lexsorted() assert result.index.is_monotonic diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index c0c4e627b1b2e..33f2c34400373 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -1760,7 +1760,7 @@ def test_to_frame_multi_major(self): def test_to_frame_multi_major_minor(self): cols = MultiIndex(levels=[['C_A', 'C_B'], ['C_1', 'C_2']], - labels=[[0, 0, 1, 1], [0, 1, 0, 1]]) + codes=[[0, 0, 1, 1], [0, 1, 0, 1]]) idx = MultiIndex.from_tuples([(1, 'one'), (1, 'two'), (2, 'one'), ( 2, 'two'), (3, 'three'), (4, 'four')]) df = DataFrame([[1, 2, 11, 12], [3, 4, 13, 14], @@ -2486,10 +2486,10 @@ def is_sorted(arr): return (arr[1:] > arr[:-1]).any() sorted_minor = self.panel.sort_index(level=1) - assert is_sorted(sorted_minor.index.labels[1]) + assert is_sorted(sorted_minor.index.codes[1]) sorted_major = sorted_minor.sort_index(level=0) - assert is_sorted(sorted_major.index.labels[0]) + assert is_sorted(sorted_major.index.codes[0]) def test_to_string(self): buf = StringIO() @@ -2561,7 +2561,7 @@ def test_axis_dummies(self): def test_get_dummies(self): from pandas.core.reshape.reshape import get_dummies, make_axis_dummies - self.panel['Label'] = self.panel.index.labels[1] + self.panel['Label'] = self.panel.index.codes[1] minor_dummies = make_axis_dummies(self.panel, 'minor').astype(np.uint8) dummies = get_dummies(self.panel['Label']) tm.assert_numpy_array_equal(dummies.values, minor_dummies.values) @@ -2584,14 +2584,14 @@ def test_count(self): index = self.panel.index major_count = self.panel.count(level=0)['ItemA'] - labels = index.labels[0] + level_codes = index.codes[0] for i, idx in enumerate(index.levels[0]): - assert major_count[i] == (labels == i).sum() + assert major_count[i] == (level_codes == i).sum() minor_count = self.panel.count(level=1)['ItemA'] - labels = index.labels[1] + level_codes 
= index.codes[1] for i, idx in enumerate(index.levels[1]): - assert minor_count[i] == (labels == i).sum() + assert minor_count[i] == (level_codes == i).sum() def test_join(self): lp1 = self.panel.filter(['ItemA', 'ItemB']) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 97c64d013d241..7a1828149cd87 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -838,7 +838,7 @@ def _check_types(l, r, obj='Index'): def _get_ilevel_values(index, level): # accept level number only unique = index.levels[level] - labels = index.labels[level] + labels = index.codes[level] filled = take_1d(unique.values, labels, fill_value=unique._na_value) values = unique._shallow_copy(filled, name=index.names[level]) return values From fb202f7deb23188dd4d9e55ed82c9bc137affe8d Mon Sep 17 00:00:00 2001 From: tp Date: Sat, 17 Nov 2018 10:31:41 +0000 Subject: [PATCH 04/12] various changes --- doc/source/whatsnew/v0.24.0.rst | 8 + pandas/core/indexes/base.py | 43 ++++ pandas/core/indexes/multi.py | 201 ++++++++++-------- pandas/core/reshape/merge.py | 12 +- .../tests/indexes/multi/test_constructor.py | 9 + pandas/tests/indexes/multi/test_copy.py | 8 +- pandas/tests/indexes/multi/test_format.py | 2 +- pandas/tests/indexes/multi/test_get_set.py | 1 + pandas/tests/io/test_pytables.py | 12 +- pandas/tests/reshape/merge/test_multi.py | 8 +- 10 files changed, 193 insertions(+), 111 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index eab5956735f12..03283e3aa006b 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1100,6 +1100,14 @@ Other API Changes Deprecations ~~~~~~~~~~~~ +- :attr:`MultiIndex.labels` has been deprecated and replaced by :attr:`MultiIndex.codes`. + The functionality is unchanged. This new name better reflects the natures of + these codes and makes the API more similar to the API for + :class:`CategoricalIndex`(:issue:`13443`). + As a concequence, other uses of the name ``labels`` have also been deprecated in ``MultiIndex`` and replaced with ``codes``: + - You should initialize a MultiIndex instance using a parameter named ``codes`` rather than ``labels``. + - :meth:`MultiIndex.set_labels` has been deprecated in favor of :meth:`MultiIndex.set_codes` + - for method :meth:`MultiIndex.copy`, the ``labels`` parameter has been deprecated and replaced by a ``codes`` parameter. - :meth:`DataFrame.to_stata`, :meth:`read_stata`, :class:`StataReader` and :class:`StataWriter` have deprecated the ``encoding`` argument. The encoding of a Stata dta file is determined by the file type and cannot be changed (:issue:`21244`) - :meth:`MultiIndex.to_hierarchical` is deprecated and will be removed in a future version (:issue:`21613`) - :meth:`Series.ptp` is deprecated. Use ``numpy.ptp`` instead (:issue:`21614`) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index a5b8e22070923..4676ea632c949 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4103,12 +4103,24 @@ def asof_locs(self, where, mask): return result +<<<<<<< HEAD def sort_values(self, return_indexer=False, ascending=True): """ Return a sorted copy of the index. Return a sorted copy of the index, and optionally return the indices that sorted the index itself. 
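        A minimal sketch of the behaviour described above (the index values and
        the exact reprs are illustrative, not taken from this patch):

        >>> import pandas as pd
        >>> idx = pd.Index([3, 1, 2])
        >>> idx.sort_values()
        Int64Index([1, 2, 3], dtype='int64')
        >>> idx.sort_values(return_indexer=True)
        (Int64Index([1, 2, 3], dtype='int64'), array([1, 2, 0]))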
+======= + levels, codes, names = ( + _restore_dropped_levels_multijoin(self, other, + dropped_names, + join_idx, + lidx, ridx)) + + # Re-create the multi-index + multi_join_idx = MultiIndex(levels=levels, codes=codes, + names=names, verify_integrity=False) +>>>>>>> various changes Parameters ---------- @@ -4466,15 +4478,24 @@ def isin(self, values, level=None): passed set of values. The length of the returned boolean array matches the length of the index. +<<<<<<< HEAD Parameters ---------- values : set or list-like Sought values. .. versionadded:: 0.18.1 +======= + new_level_codes = algos.take_nd(rev_indexer, left.codes[level], + allow_fill=False) + + new_codes = list(left.codes) + new_codes[level] = new_level_codes +>>>>>>> various changes Support for values as a set. +<<<<<<< HEAD level : str or int, optional Name or position of the index level to use (if the index is a `MultiIndex`). @@ -4483,18 +4504,40 @@ def isin(self, values, level=None): ------- is_contained : ndarray NumPy array of boolean values. +======= + if keep_order: # just drop missing values. o.w. keep order + left_indexer = np.arange(len(left), dtype=np.intp) + mask = new_level_codes != -1 + if not mask.all(): + new_codes = [codes_[mask] for codes_ in new_codes] + left_indexer = left_indexer[mask] + + else: # tie out the order with other + if level == 0: # outer most level, take the fast route + ngroups = 1 + new_level_codes.max() + left_indexer, counts = libalgos.groupsort_indexer( + new_level_codes, ngroups) +>>>>>>> various changes See Also -------- Series.isin : Same for Series. DataFrame.isin : Same method for DataFrames. +<<<<<<< HEAD Notes ----- In the case of `MultiIndex` you must either specify `values` as a list-like object containing tuples that are the same length as the number of levels, or specify `level`. Otherwise it will raise a ``ValueError``. +======= + else: # sort the leaves + mask = new_level_codes != -1 + mask_all = mask.all() + if not mask_all: + new_codes = [lab[mask] for lab in new_codes] +>>>>>>> various changes If `level` is specified: diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 90c4d4f7dd21b..46e6e0b56bd4e 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -143,7 +143,7 @@ class MultiIndex(Index): copy : boolean, default False Copy the meta-data verify_integrity : boolean, default True - Check that the levels/labels are consistent and valid + Check that the levels/codes are consistent and valid Examples --------- @@ -188,6 +188,7 @@ class MultiIndex(Index): from_tuples from_product set_levels + set_codes set_labels to_frame to_flat_index @@ -227,7 +228,7 @@ def __new__(cls, levels=None, codes=None, sortorder=None, names=None, result = object.__new__(MultiIndex) - # we've already validated levels and labels, so shortcut here + # we've already validated levels and codes, so shortcut here result._set_levels(levels, copy=copy, validate=False) result._set_codes(codes, copy=copy, validate=False) @@ -584,6 +585,13 @@ def set_levels(self, levels, level=None, inplace=False, def codes(self): return self._codes + @property + def labels(self): + warnings.warn(("labels was deprecated in version 0.24.0. 
" + "Use .codes instead."), + FutureWarning, stacklevel=2) + return self.codes + def _set_codes(self, codes, level=None, copy=False, validate=True, verify_integrity=False): @@ -614,10 +622,10 @@ def _set_codes(self, codes, level=None, copy=False, validate=True, def set_labels(self, labels, level=None, inplace=False, verify_integrity=True): - warnings.warn(("set_labels was deprecated in version 0.24.0." - "Use set_codes instead."), + warnings.warn(("set_labels was deprecated in version 0.24.0. " + "Use .set_codes instead."), FutureWarning, stacklevel=2) - return self.set_codes(labels, level=level, inplace=inplace, + return self.set_codes(codes=labels, level=level, inplace=inplace, verify_integrity=verify_integrity) @deprecate_kwarg(old_arg_name='labels', new_arg_name='codes') @@ -684,10 +692,11 @@ def set_codes(self, codes, level=None, inplace=False, if not inplace: return idx - def copy(self, names=None, dtype=None, levels=None, labels=None, + @deprecate_kwarg(old_arg_name='labels', new_arg_name='codes') + def copy(self, names=None, dtype=None, levels=None, codes=None, deep=False, _set_identity=False, **kwargs): """ - Make a copy of this object. Names, dtype, levels and labels can be + Make a copy of this object. Names, dtype, levels and codes can be passed and will be set on new copy. Parameters @@ -695,7 +704,7 @@ def copy(self, names=None, dtype=None, levels=None, labels=None, names : sequence, optional dtype : numpy dtype or pandas type, optional levels : sequence, optional - labels : sequence, optional + codes : sequence, optional Returns ------- @@ -714,14 +723,14 @@ def copy(self, names=None, dtype=None, levels=None, labels=None, from copy import deepcopy if levels is None: levels = deepcopy(self.levels) - if labels is None: - labels = deepcopy(self.codes) + if codes is None: + codes = deepcopy(self.codes) else: if levels is None: levels = self.levels - if labels is None: - labels = self.codes - return MultiIndex(levels=levels, codes=labels, names=names, + if codes is None: + codes = self.codes + return MultiIndex(levels=levels, codes=codes, names=names, sortorder=self.sortorder, verify_integrity=False, _set_identity=_set_identity) @@ -1198,7 +1207,7 @@ def dropna(self, how='any'): raise ValueError("invalid how option: {0}".format(how)) new_codes = [label[~indexer] for label in self.codes] - return self.copy(labels=new_codes, deep=True) + return self.copy(codes=new_codes, deep=True) def get_value(self, series, key): # somewhat broken encapsulation @@ -1474,9 +1483,9 @@ def lexsort_depth(self): else: return 0 - int64_labels = [ensure_int64(lab) for lab in self.codes] + int64_codes = [ensure_int64(level_codes) for level_codes in self.codes] for k in range(self.nlevels, 0, -1): - if libalgos.is_lexsorted(int64_labels[:k]): + if libalgos.is_lexsorted(int64_codes[:k]): return k return 0 @@ -1518,9 +1527,9 @@ def _sort_levels_monotonic(self): return self new_levels = [] - new_labels = [] + new_codes = [] - for lev, lab in zip(self.levels, self.codes): + for lev, level_codes in zip(self.levels, self.codes): if not lev.is_monotonic: try: @@ -1531,15 +1540,15 @@ def _sort_levels_monotonic(self): else: lev = lev.take(indexer) - # indexer to reorder the labels + # indexer to reorder the level codes indexer = ensure_int64(indexer) ri = lib.get_reverse_indexer(indexer, len(indexer)) - lab = algos.take_1d(ri, lab) + level_codes = algos.take_1d(ri, level_codes) new_levels.append(lev) - new_labels.append(lab) + new_codes.append(level_codes) - return MultiIndex(new_levels, new_labels, + return 
MultiIndex(new_levels, new_codes, names=self.names, sortorder=self.sortorder, verify_integrity=False) @@ -1577,15 +1586,15 @@ def remove_unused_levels(self): """ new_levels = [] - new_labels = [] + new_codes = [] changed = False - for lev, lab in zip(self.levels, self.codes): + for lev, level_codes in zip(self.levels, self.codes): # Since few levels are typically unused, bincount() is more # efficient than unique() - however it only accepts positive values # (and drops order): - uniques = np.where(np.bincount(lab + 1) > 0)[0] - 1 + uniques = np.where(np.bincount(level_codes + 1) > 0)[0] - 1 has_na = int(len(uniques) and (uniques[0] == -1)) if len(uniques) != len(lev) + has_na: @@ -1594,33 +1603,34 @@ def remove_unused_levels(self): # Recalculate uniques, now preserving order. # Can easily be cythonized by exploiting the already existing - # "uniques" and stop parsing "lab" when all items are found: - uniques = algos.unique(lab) + # "uniques" and stop parsing "level_codes" when all items + # are found: + uniques = algos.unique(level_codes) if has_na: na_idx = np.where(uniques == -1)[0] # Just ensure that -1 is in first position: uniques[[0, na_idx[0]]] = uniques[[na_idx[0], 0]] - # labels get mapped from uniques to 0:len(uniques) + # codes get mapped from uniques to 0:len(uniques) # -1 (if present) is mapped to last position - label_mapping = np.zeros(len(lev) + has_na) + code_mapping = np.zeros(len(lev) + has_na) # ... and reassigned value -1: - label_mapping[uniques] = np.arange(len(uniques)) - has_na + code_mapping[uniques] = np.arange(len(uniques)) - has_na - lab = label_mapping[lab] + level_codes = code_mapping[level_codes] # new levels are simple lev = lev.take(uniques[has_na:]) new_levels.append(lev) - new_labels.append(lab) + new_codes.append(level_codes) result = self._shallow_copy() if changed: result._reset_identity() result._set_levels(new_levels, validate=False) - result._set_codes(new_labels, validate=False) + result._set_codes(new_codes, validate=False) return result @@ -1637,7 +1647,7 @@ def levshape(self): def __reduce__(self): """Necessary for making this object picklable""" d = dict(levels=[lev for lev in self.levels], - labels=[label for label in self.codes], + codes=[level_codes for level_codes in self.codes], sortorder=self.sortorder, names=list(self.names)) return ibase._new_Index, (self.__class__, d), None @@ -1646,17 +1656,17 @@ def __setstate__(self, state): if isinstance(state, dict): levels = state.get('levels') - labels = state.get('labels') + codes = state.get('codes') sortorder = state.get('sortorder') names = state.get('names') elif isinstance(state, tuple): nd_state, own_state = state - levels, labels, sortorder, names = own_state + levels, codes, sortorder, names = own_state self._set_levels([Index(x) for x in levels], validate=False) - self._set_codes(labels) + self._set_codes(codes) self._set_names(names) self.sortorder = sortorder self._verify_integrity() @@ -1685,9 +1695,9 @@ def __getitem__(self, key): if isinstance(key, Index): key = np.asarray(key) - new_labels = [lab[key] for lab in self.codes] + new_codes = [level_codes[key] for level_codes in self.codes] - return MultiIndex(levels=self.levels, codes=new_labels, + return MultiIndex(levels=self.levels, codes=new_codes, names=self.names, sortorder=sortorder, verify_integrity=False) @@ -1773,13 +1783,14 @@ def where(self, cond, other=None): raise NotImplementedError(".where is not supported for " "MultiIndex operations") - def drop(self, labels, level=None, errors='raise'): + 
@deprecate_kwarg(old_arg_name='labels', new_arg_name='codes') + def drop(self, codes, level=None, errors='raise'): """ - Make new MultiIndex with passed list of labels deleted + Make new MultiIndex with passed list of codes deleted Parameters ---------- - labels : array-like + codes : array-like Must be a list of tuples level : int or level name, default None @@ -1788,24 +1799,24 @@ def drop(self, labels, level=None, errors='raise'): dropped : MultiIndex """ if level is not None: - return self._drop_from_level(labels, level) + return self._drop_from_level(codes, level) try: - if not isinstance(labels, (np.ndarray, Index)): - labels = com.index_labels_to_array(labels) - indexer = self.get_indexer(labels) + if not isinstance(codes, (np.ndarray, Index)): + codes = com.index_labels_to_array(codes) + indexer = self.get_indexer(codes) mask = indexer == -1 if mask.any(): if errors != 'ignore': - raise ValueError('labels %s not contained in axis' % - labels[mask]) + raise ValueError('codes %s not contained in axis' % + codes[mask]) except Exception: pass inds = [] - for label in labels: + for level_codes in codes: try: - loc = self.get_loc(label) + loc = self.get_loc(level_codes) # get_loc returns either an integer, a slice, or a boolean # mask if isinstance(loc, int): @@ -1830,11 +1841,11 @@ def drop(self, labels, level=None, errors='raise'): return self.delete(inds) - def _drop_from_level(self, labels, level): - labels = com.index_labels_to_array(labels) + def _drop_from_level(self, codes, level): + codes = com.index_labels_to_array(codes) i = self._get_level_number(level) index = self.levels[i] - values = index.get_indexer(labels) + values = index.get_indexer(codes) mask = ~algos.isin(self.codes[i], values) @@ -1883,17 +1894,17 @@ def swaplevel(self, i=-2, j=-1): labels=[[0, 1, 0, 1], [0, 0, 1, 1]]) """ new_levels = list(self.levels) - new_labels = list(self.codes) + new_codes = list(self.codes) new_names = list(self.names) i = self._get_level_number(i) j = self._get_level_number(j) new_levels[i], new_levels[j] = new_levels[j], new_levels[i] - new_labels[i], new_labels[j] = new_labels[j], new_labels[i] + new_codes[i], new_codes[j] = new_codes[j], new_codes[i] new_names[i], new_names[j] = new_names[j], new_names[i] - return MultiIndex(levels=new_levels, codes=new_labels, + return MultiIndex(levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False) def reorder_levels(self, order): @@ -1909,10 +1920,10 @@ def reorder_levels(self, order): 'number of levels (%d), got %d' % (self.nlevels, len(order))) new_levels = [self.levels[i] for i in order] - new_labels = [self.codes[i] for i in order] + new_codes = [self.codes[i] for i in order] new_names = [self.names[i] for i in order] - return MultiIndex(levels=new_levels, codes=new_labels, + return MultiIndex(levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False) def __getslice__(self, i, j): @@ -1920,20 +1931,22 @@ def __getslice__(self, i, j): def _get_labels_for_sorting(self): """ - we categorizing our labels by using the - available catgories (all, not just observed) + we categorizing our codes by using the + available categories (all, not just observed) excluding any missing ones (-1); this is in preparation for sorting, where we need to disambiguate that -1 is not a valid valid """ from pandas.core.arrays import Categorical - def cats(label): - return np.arange(np.array(label).max() + 1 if len(label) else 0, - dtype=label.dtype) + def cats(level_codes): + return np.arange(np.array(level_codes).max() + 1 if + 
len(level_codes) else 0, + dtype=level_codes.dtype) - return [Categorical.from_codes(label, cats(label), ordered=True) - for label in self.codes] + return [Categorical.from_codes(level_codes, cats(level_codes), + ordered=True) + for level_codes in self.codes] def sortlevel(self, level=0, ascending=True, sort_remaining=True): """ @@ -1976,15 +1989,15 @@ def sortlevel(self, level=0, ascending=True, sort_remaining=True): # level ordering else: - labels = list(self.codes) + codes = list(self.codes) shape = list(self.levshape) - # partition labels and shape - primary = tuple(labels.pop(lev - i) for i, lev in enumerate(level)) + # partition codes and shape + primary = tuple(codes.pop(lev - i) for i, lev in enumerate(level)) primshp = tuple(shape.pop(lev - i) for i, lev in enumerate(level)) if sort_remaining: - primary += primary + tuple(labels) + primary += primary + tuple(codes) primshp += primshp + tuple(shape) else: sortorder = level[0] @@ -1996,9 +2009,9 @@ def sortlevel(self, level=0, ascending=True, sort_remaining=True): indexer = indexer[::-1] indexer = ensure_platform_int(indexer) - new_labels = [lab.take(indexer) for lab in self.codes] + new_codes = [level_codes.take(indexer) for level_codes in self.codes] - new_index = MultiIndex(codes=new_labels, levels=self.levels, + new_index = MultiIndex(codes=new_codes, levels=self.levels, names=self.names, sortorder=sortorder, verify_integrity=False) @@ -2476,15 +2489,16 @@ def _get_level_indexer(self, key, level=0, indexer=None): # if the indexer is provided, then use this level_index = self.levels[level] - labels = self.codes[level] + level_codes = self.codes[level] - def convert_indexer(start, stop, step, indexer=indexer, labels=labels): - # given the inputs and the labels/indexer, compute an indexer set + def convert_indexer(start, stop, step, indexer=indexer, + codes=level_codes): + # given the inputs and the codes/indexer, compute an indexer set # if we have a provided indexer, then this need not consider # the entire labels set r = np.arange(start, stop, step) - if indexer is not None and len(indexer) != len(labels): + if indexer is not None and len(indexer) != len(codes): # we have an indexer which maps the locations in the labels # that we have already selected (and is not an indexer for the @@ -2494,14 +2508,14 @@ def convert_indexer(start, stop, step, indexer=indexer, labels=labels): # selected from pandas import Series mapper = Series(indexer) - indexer = labels.take(ensure_platform_int(indexer)) + indexer = codes.take(ensure_platform_int(indexer)) result = Series(Index(indexer).isin(r).nonzero()[0]) m = result.map(mapper)._ndarray_values else: - m = np.zeros(len(labels), dtype=bool) - m[np.in1d(labels, r, - assume_unique=Index(labels).is_unique)] = True + m = np.zeros(len(codes), dtype=bool) + m[np.in1d(codes, r, + assume_unique=Index(codes).is_unique)] = True return m @@ -2541,8 +2555,8 @@ def convert_indexer(start, stop, step, indexer=indexer, labels=labels): return convert_indexer(start, stop + 1, step) else: # sorted, so can return slice object -> view - i = labels.searchsorted(start, side='left') - j = labels.searchsorted(stop, side='right') + i = level_codes.searchsorted(start, side='left') + j = level_codes.searchsorted(stop, side='right') return slice(i, j, step) else: @@ -2551,14 +2565,14 @@ def convert_indexer(start, stop, step, indexer=indexer, labels=labels): if level > 0 or self.lexsort_depth == 0: # Desired level is not sorted - locs = np.array(labels == code, dtype=bool, copy=False) + locs = np.array(level_codes == 
code, dtype=bool, copy=False) if not locs.any(): # The label is present in self.levels[level] but unused: raise KeyError(key) return locs - i = labels.searchsorted(code, side='left') - j = labels.searchsorted(code, side='right') + i = level_codes.searchsorted(code, side='left') + j = level_codes.searchsorted(code, side='right') if i == j: # The label is present in self.levels[level] but unused: raise KeyError(key) @@ -2708,10 +2722,10 @@ def truncate(self, before=None, after=None): new_levels = list(self.levels) new_levels[0] = new_levels[0][i:j] - new_labels = [lab[left:right] for lab in self.codes] - new_labels[0] = new_labels[0] - i + new_codes = [level_codes[left:right] for level_codes in self.codes] + new_codes[0] = new_codes[0] - i - return MultiIndex(levels=new_levels, codes=new_labels, + return MultiIndex(levels=new_levels, codes=new_codes, verify_integrity=False) def equals(self, other): @@ -2932,8 +2946,8 @@ def insert(self, loc, item): 'levels.') new_levels = [] - new_labels = [] - for k, level, labels in zip(item, self.levels, self.codes): + new_codes = [] + for k, level, level_codes in zip(item, self.levels, self.codes): if k not in level: # have to insert into level # must insert at end otherwise you have to recompute all the @@ -2944,9 +2958,10 @@ def insert(self, loc, item): lev_loc = level.get_loc(k) new_levels.append(level) - new_labels.append(np.insert(ensure_int64(labels), loc, lev_loc)) + new_codes.append(np.insert( + ensure_int64(level_codes), loc, lev_loc)) - return MultiIndex(levels=new_levels, codes=new_labels, + return MultiIndex(levels=new_levels, codes=new_codes, names=self.names, verify_integrity=False) def delete(self, loc): @@ -2957,8 +2972,8 @@ def delete(self, loc): ------- new_index : MultiIndex """ - new_labels = [np.delete(lab, loc) for lab in self.codes] - return MultiIndex(levels=self.levels, codes=new_labels, + new_codes = [np.delete(level_codes, loc) for level_codes in self.codes] + return MultiIndex(levels=self.levels, codes=new_codes, names=self.names, verify_integrity=False) def _wrap_joined_index(self, joined, other): diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 4bb5469fa411d..c0c016f9a8caa 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1172,7 +1172,7 @@ def _convert_to_mulitindex(index): join_index = _convert_to_mulitindex(join_index) join_levels = join_index.levels - join_labels = join_index.labels + join_codes = join_index.codes join_names = join_index.names # lindexer and rindexer hold the indexes where the join occurred @@ -1197,16 +1197,16 @@ def _convert_to_mulitindex(index): name_idx = idx.names.index(dropped_level_name) restore_levels = idx.levels[name_idx] - # Inject -1 in the labels list where a join was not possible + # Inject -1 in the codes list where a join was not possible # IOW indexer[i]=-1 - labels = idx.labels[name_idx] - restore_labels = algos.take_nd(labels, indexer, fill_value=-1) + codes = idx.codes[name_idx] + restore_codes = algos.take_nd(codes, indexer, fill_value=-1) join_levels = join_levels + [restore_levels] - join_labels = join_labels + [restore_labels] + join_codes = join_codes + [restore_codes] join_names = join_names + [dropped_level_name] - return join_levels, join_labels, join_names + return join_levels, join_codes, join_names class _OrderedMerge(_MergeOperation): diff --git a/pandas/tests/indexes/multi/test_constructor.py b/pandas/tests/indexes/multi/test_constructor.py index 1f674063563c7..d80395e513497 100644 --- 
a/pandas/tests/indexes/multi/test_constructor.py +++ b/pandas/tests/indexes/multi/test_constructor.py @@ -85,6 +85,15 @@ def test_constructor_mismatched_codes_levels(idx): idx.copy().set_codes([[0, 0, 0, 0], [0, 0]]) +def test_labels_deprecated(idx): + # GH23752 + with tm.assert_produces_warning(FutureWarning): + MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']], + labels=[[0, 1, 2, 3]], names=['first']) + with tm.assert_produces_warning(FutureWarning): + idx.labels + + def test_copy_in_constructor(): levels = np.array(["a", "b", "c"]) codes = np.array([1, 1, 2, 0, 0, 1, 1]) diff --git a/pandas/tests/indexes/multi/test_copy.py b/pandas/tests/indexes/multi/test_copy.py index a2dda2491fd70..754e9aac56dfc 100644 --- a/pandas/tests/indexes/multi/test_copy.py +++ b/pandas/tests/indexes/multi/test_copy.py @@ -37,6 +37,12 @@ def test_shallow_copy(idx): assert_multiindex_copied(i_copy, idx) +def test_labels_deprecated(idx): + # GH23752 + codes = idx.codes + with tm.assert_produces_warning(FutureWarning): + idx.copy(labels=codes) + def test_view(idx): i_view = idx.view() assert_multiindex_copied(i_view, idx) @@ -70,7 +76,7 @@ def test_copy_method(deep): @pytest.mark.parametrize('kwarg, value', [ ('names', ['thrid', 'fourth']), ('levels', [['foo2', 'bar2'], ['fizz2', 'buzz2']]), - ('labels', [[1, 0, 0, 0], [1, 1, 0, 0]]) + ('codes', [[1, 0, 0, 0], [1, 1, 0, 0]]) ]) def test_copy_method_kwargs(deep, kwarg, value): # gh-12309: Check that the "name" argument as well other kwargs are honored diff --git a/pandas/tests/indexes/multi/test_format.py b/pandas/tests/indexes/multi/test_format.py index 164e5cb1a9cae..8a65a930a8ce5 100644 --- a/pandas/tests/indexes/multi/test_format.py +++ b/pandas/tests/indexes/multi/test_format.py @@ -59,7 +59,7 @@ def test_repr_with_unicode_data(): assert "\\u" not in repr(index) # we don't want unicode-escaped -@pytest.mark.xfail(raises=TypeError) +@pytest.mark.skip(reason="#22511 will remove this test") def test_repr_roundtrip(): mi = MultiIndex.from_product([list('ab'), range(3)], diff --git a/pandas/tests/indexes/multi/test_get_set.py b/pandas/tests/indexes/multi/test_get_set.py index d8e075bbc02d3..5b43c9050e920 100644 --- a/pandas/tests/indexes/multi/test_get_set.py +++ b/pandas/tests/indexes/multi/test_get_set.py @@ -323,6 +323,7 @@ def test_set_codes(idx): def test_set_labels_deprecated(): + # GH23752 ind = pd.MultiIndex.from_tuples([(0, i) for i in range(130)]) new_labels = range(129, -1, -1) expected = pd.MultiIndex.from_tuples( diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index 84a0e3d867783..17f27e60ec28f 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -1774,8 +1774,8 @@ def test_append_diff_item_order(self): def test_append_hierarchical(self): index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', 'three']], - labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=['foo', 'bar']) df = DataFrame(np.random.randn(10, 3), index=index, columns=['A', 'B', 'C']) @@ -1908,8 +1908,8 @@ def test_select_columns_in_where(self): # in the `where` argument index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', 'three']], - labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=['foo_name', 'bar_name']) # With a DataFrame @@ -2877,8 +2877,8 @@ def 
test_can_serialize_dates(self): def test_store_hierarchical(self): index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', 'three']], - labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=['foo', 'bar']) frame = DataFrame(np.random.randn(10, 3), index=index, columns=['A', 'B', 'C']) diff --git a/pandas/tests/reshape/merge/test_multi.py b/pandas/tests/reshape/merge/test_multi.py index a1158201844b0..aa32948468907 100644 --- a/pandas/tests/reshape/merge/test_multi.py +++ b/pandas/tests/reshape/merge/test_multi.py @@ -32,8 +32,8 @@ def right(): """right dataframe (multi-indexed) for multi-index join tests""" index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', 'three']], - labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=['key1', 'key2']) return DataFrame(np.random.randn(10, 3), index=index, @@ -83,8 +83,8 @@ class TestMergeMulti(object): def setup_method(self): self.index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', 'three']], - labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=['first', 'second']) self.to_join = DataFrame(np.random.randn(10, 3), index=self.index, columns=['j_one', 'j_two', 'j_three']) From 0e74c15c027a639c81d536b3065ef740812e4e38 Mon Sep 17 00:00:00 2001 From: tp Date: Sun, 18 Nov 2018 10:24:56 +0000 Subject: [PATCH 05/12] update PR --- asv_bench/benchmarks/groupby.py | 4 ++-- asv_bench/benchmarks/join_merge.py | 10 +++++----- asv_bench/benchmarks/reindex.py | 6 +++--- doc/source/whatsnew/v0.24.0.rst | 4 ++-- pandas/core/frame.py | 4 ++-- pandas/core/indexes/multi.py | 8 ++++---- pandas/core/series.py | 4 ++-- pandas/tests/indexes/multi/test_copy.py | 1 + pandas/tests/indexes/multi/test_get_set.py | 4 ++-- pandas/tests/io/test_feather.py | 16 ++++++++++------ 10 files changed, 33 insertions(+), 28 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index ee5ae69555d16..59e43ee22afde 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -473,8 +473,8 @@ def setup(self): n1 = 400 n2 = 250 index = MultiIndex(levels=[np.arange(n1), tm.makeStringIndex(n2)], - labels=[np.repeat(range(n1), n2).tolist(), - list(range(n2)) * n1], + codes=[np.repeat(range(n1), n2).tolist(), + list(range(n2)) * n1], names=['lev1', 'lev2']) arr = np.random.randn(n1 * n2, 3) arr[::10000, 0] = np.nan diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index 84ccc10e8302f..88a59fea375ea 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -115,16 +115,16 @@ class Join(object): def setup(self, sort): level1 = tm.makeStringIndex(10).values level2 = tm.makeStringIndex(1000).values - label1 = np.arange(10).repeat(1000) - label2 = np.tile(np.arange(1000), 10) + codes1 = np.arange(10).repeat(1000) + codes2 = np.tile(np.arange(1000), 10) index2 = MultiIndex(levels=[level1, level2], - labels=[label1, label2]) + codes=[codes1, codes2]) self.df_multi = DataFrame(np.random.randn(len(index2), 4), index=index2, columns=['A', 'B', 'C', 'D']) - self.key1 = np.tile(level1.take(label1), 10) - self.key2 = np.tile(level2.take(label2), 10) + self.key1 = np.tile(level1.take(codes1), 10) + self.key2 = 
np.tile(level2.take(codes2), 10) self.df = DataFrame({'data1': np.random.randn(100000), 'data2': np.random.randn(100000), 'key1': self.key1, diff --git a/asv_bench/benchmarks/reindex.py b/asv_bench/benchmarks/reindex.py index 82c61a98e2c34..576dc495eb984 100644 --- a/asv_bench/benchmarks/reindex.py +++ b/asv_bench/benchmarks/reindex.py @@ -71,9 +71,9 @@ class LevelAlign(object): def setup(self): self.index = MultiIndex( levels=[np.arange(10), np.arange(100), np.arange(100)], - labels=[np.arange(10).repeat(10000), - np.tile(np.arange(100).repeat(100), 10), - np.tile(np.tile(np.arange(100), 100), 10)]) + codes=[np.arange(10).repeat(10000), + np.tile(np.arange(100).repeat(100), 10), + np.tile(np.tile(np.arange(100), 100), 10)]) self.df = DataFrame(np.random.randn(len(self.index), 4), index=self.index) self.df_level = DataFrame(np.random.randn(100, 4), diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 03283e3aa006b..4e800a6f2c257 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1104,8 +1104,8 @@ Deprecations The functionality is unchanged. This new name better reflects the natures of these codes and makes the API more similar to the API for :class:`CategoricalIndex`(:issue:`13443`). - As a concequence, other uses of the name ``labels`` have also been deprecated in ``MultiIndex`` and replaced with ``codes``: - - You should initialize a MultiIndex instance using a parameter named ``codes`` rather than ``labels``. + As a consequence, other uses of the name ``labels`` have also been deprecated in ``MultiIndex`` and replaced with ``codes``: + - You should initialize a ``MultiIndex`` instance using a parameter named ``codes`` rather than ``labels``. - :meth:`MultiIndex.set_labels` has been deprecated in favor of :meth:`MultiIndex.set_codes` - for method :meth:`MultiIndex.copy`, the ``labels`` parameter has been deprecated and replaced by a ``codes`` parameter. - :meth:`DataFrame.to_stata`, :meth:`read_stata`, :class:`StataReader` and :class:`StataWriter` have deprecated the ``encoding`` argument. The encoding of a Stata dta file is determined by the file type and cannot be changed (:issue:`21244`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 45e4f4e0261f0..49db2a8b6a67b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3736,8 +3736,8 @@ def drop(self, labels=None, axis=0, index=None, columns=None, >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'], ... ['speed', 'weight', 'length']], - ... labels=[[0, 0, 0, 1, 1, 1, 2, 2, 2], - ... [0, 1, 2, 0, 1, 2, 0, 1, 2]]) + ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], + ... [0, 1, 2, 0, 1, 2, 0, 1, 2]]) >>> df = pd.DataFrame(index=midx, columns=['big', 'small'], ... data=[[45, 30], [200, 100], [1.5, 1], [30, 20], ... [250, 150], [1.5, 0.8], [320, 250], diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 46e6e0b56bd4e..8cc678ed3635a 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -587,7 +587,7 @@ def codes(self): @property def labels(self): - warnings.warn(("labels was deprecated in version 0.24.0. " + warnings.warn((".labels was deprecated in version 0.24.0. " "Use .codes instead."), FutureWarning, stacklevel=2) return self.codes @@ -622,7 +622,7 @@ def _set_codes(self, codes, level=None, copy=False, validate=True, def set_labels(self, labels, level=None, inplace=False, verify_integrity=True): - warnings.warn(("set_labels was deprecated in version 0.24.0. 
" + warnings.warn((".set_labels was deprecated in version 0.24.0. " "Use .set_codes instead."), FutureWarning, stacklevel=2) return self.set_codes(codes=labels, level=level, inplace=inplace, @@ -1512,7 +1512,7 @@ def _sort_levels_monotonic(self): -------- >>> i = pd.MultiIndex(levels=[['a', 'b'], ['bb', 'aa']], - labels=[[0, 0, 1, 1], [0, 1, 0, 1]]) + codes=[[0, 0, 1, 1], [0, 1, 0, 1]]) >>> i MultiIndex(levels=[['a', 'b'], ['bb', 'aa']], labels=[[0, 0, 1, 1], [0, 1, 0, 1]]) @@ -1885,7 +1885,7 @@ def swaplevel(self, i=-2, j=-1): Examples -------- >>> mi = pd.MultiIndex(levels=[['a', 'b'], ['bb', 'aa']], - ... labels=[[0, 0, 1, 1], [0, 1, 0, 1]]) + ... codes=[[0, 0, 1, 1], [0, 1, 0, 1]]) >>> mi MultiIndex(levels=[['a', 'b'], ['bb', 'aa']], labels=[[0, 0, 1, 1], [0, 1, 0, 1]]) diff --git a/pandas/core/series.py b/pandas/core/series.py index b4b17b43f242c..47edd29f2bbf0 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3663,8 +3663,8 @@ def drop(self, labels=None, axis=0, index=None, columns=None, >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'], ... ['speed', 'weight', 'length']], - ... labels=[[0, 0, 0, 1, 1, 1, 2, 2, 2], - ... [0, 1, 2, 0, 1, 2, 0, 1, 2]]) + ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], + ... [0, 1, 2, 0, 1, 2, 0, 1, 2]]) >>> s = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], ... index=midx) >>> s diff --git a/pandas/tests/indexes/multi/test_copy.py b/pandas/tests/indexes/multi/test_copy.py index 754e9aac56dfc..1379e0d85f860 100644 --- a/pandas/tests/indexes/multi/test_copy.py +++ b/pandas/tests/indexes/multi/test_copy.py @@ -43,6 +43,7 @@ def test_labels_deprecated(idx): with tm.assert_produces_warning(FutureWarning): idx.copy(labels=codes) + def test_view(idx): i_view = idx.view() assert_multiindex_copied(i_view, idx) diff --git a/pandas/tests/indexes/multi/test_get_set.py b/pandas/tests/indexes/multi/test_get_set.py index 5b43c9050e920..d201cb2eb178b 100644 --- a/pandas/tests/indexes/multi/test_get_set.py +++ b/pandas/tests/indexes/multi/test_get_set.py @@ -348,7 +348,7 @@ def test_set_levels_codes_names_bad_input(idx): with pytest.raises(ValueError, match='Length of levels'): idx.set_levels([levels[0]]) - with tm.assert_raises_regex(ValueError, 'Length of codes'): + with pytest.raises(ValueError, match='Length of codes'): idx.set_codes([codes[0]]) with pytest.raises(ValueError, match='Length of names'): @@ -359,7 +359,7 @@ def test_set_levels_codes_names_bad_input(idx): idx.set_levels(levels[0]) # shouldn't scalar data error, instead should demand list-like - with tm.assert_raises_regex(TypeError, 'list of lists-like'): + with pytest.raises(TypeError, match='list of lists-like'): idx.set_codes(codes[0]) # shouldn't scalar data error, instead should demand list-like diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 19ecb378b6378..44d642399ced9 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -100,15 +100,19 @@ def test_rw_nthreads(self): "the 'nthreads' keyword is deprecated, " "use 'use_threads' instead" ) - with tm.assert_produces_warning(FutureWarning) as w: + # TODO: make the warning work with check_stacklevel=True + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False) as w: self.check_round_trip(df, nthreads=2) - assert len(w) == 1 - assert expected_warning in str(w[0]) + # we have an extra FutureWarning because of #GH23752 + assert any(expected_warning in str(x) for x in w) - with tm.assert_produces_warning(FutureWarning) as w: + # TODO: make the 
warning work with check_stacklevel=True + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False) as w: self.check_round_trip(df, nthreads=1) - assert len(w) == 1 - assert expected_warning in str(w[0]) + # we have an extra FutureWarnings because of #GH23752 + assert any(expected_warning in str(x) for x in w) def test_rw_use_threads(self): df = pd.DataFrame({'A': np.arange(100000)}) From 2f6179ead534603ff872fc712bbacf264077bee2 Mon Sep 17 00:00:00 2001 From: tp Date: Mon, 19 Nov 2018 08:09:11 +0000 Subject: [PATCH 06/12] Changed according to comments + other cleanups --- pandas/core/frame.py | 2 +- pandas/core/indexes/multi.py | 48 ++++++++++++++++++------------------ pandas/core/series.py | 2 +- 3 files changed, 26 insertions(+), 26 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 49db2a8b6a67b..9d27d17014a56 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4594,7 +4594,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, elif isinstance(labels, MultiIndex): from pandas.core.sorting import lexsort_indexer - indexer = lexsort_indexer(labels._get_labels_for_sorting(), + indexer = lexsort_indexer(labels._get_codes_for_sorting(), orders=ascending, na_position=na_position) else: diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 8cc678ed3635a..f1587b387332e 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -130,11 +130,13 @@ class MultiIndex(Index): The unique labels for each level codes : sequence of arrays Integers for each level designating which label at each location + + .. versionadded:: 0.24.0 labels : sequence of arrays + Integers for each level designating which label at each location + .. deprecated:: 0.24.0 Use ``codes`` instead - - Integers for each level designating which label at each location sortorder : optional int Level of sortedness (must be lexicographically sorted by that level) @@ -178,7 +180,6 @@ class MultiIndex(Index): names levels codes - labels nlevels levshape @@ -189,7 +190,6 @@ class MultiIndex(Index): from_product set_levels set_codes - set_labels to_frame to_flat_index is_lexsorted @@ -1206,7 +1206,7 @@ def dropna(self, how='any'): else: raise ValueError("invalid how option: {0}".format(how)) - new_codes = [label[~indexer] for label in self.codes] + new_codes = [level_codes[~indexer] for level_codes in self.codes] return self.copy(codes=new_codes, deep=True) def get_value(self, series, key): @@ -1677,11 +1677,11 @@ def __getitem__(self, key): key = com.cast_scalar_indexer(key) retval = [] - for lev, lab in zip(self.levels, self.codes): - if lab[key] == -1: + for lev, level_codes in zip(self.levels, self.codes): + if level_codes[key] == -1: retval.append(np.nan) else: - retval.append(lev[lab[key]]) + retval.append(lev[level_codes[key]]) return tuple(retval) else: @@ -1929,7 +1929,7 @@ def reorder_levels(self, order): def __getslice__(self, i, j): return self.__getitem__(slice(i, j)) - def _get_labels_for_sorting(self): + def _get_codes_for_sorting(self): """ we categorizing our codes by using the available categories (all, not just observed) @@ -2754,26 +2754,26 @@ def equals(self, other): return False for i in range(self.nlevels): - slabels = self.codes[i] - slabels = slabels[slabels != -1] - svalues = algos.take_nd(np.asarray(self.levels[i]._values), - slabels, allow_fill=False) - - olabels = other.codes[i] - olabels = olabels[olabels != -1] - ovalues = algos.take_nd( + self_codes = self.codes[i] + self_codes = 
self_codes[self_codes != -1] + self_values = algos.take_nd(np.asarray(self.levels[i]._values), + self_codes, allow_fill=False) + + other_codes = other.codes[i] + other_codes = other_codes[other_codes != -1] + other_values = algos.take_nd( np.asarray(other.levels[i]._values), - olabels, allow_fill=False) + other_codes, allow_fill=False) # since we use NaT both datetime64 and timedelta64 # we can have a situation where a level is typed say # timedelta64 in self (IOW it has other values than NaT) # but types datetime64 in other (where its all NaT) # but these are equivalent - if len(svalues) == 0 and len(ovalues) == 0: + if len(self_values) == 0 and len(other_values) == 0: continue - if not array_equivalent(svalues, ovalues): + if not array_equivalent(self_values, other_values): return False return True @@ -2951,7 +2951,7 @@ def insert(self, loc, item): if k not in level: # have to insert into level # must insert at end otherwise you have to recompute all the - # other labels + # other codes lev_loc = len(level) level = level.insert(lev_loc, k) else: @@ -2989,13 +2989,13 @@ def isin(self, values, level=None): else: num = self._get_level_number(level) levs = self.levels[num] - labs = self.codes[num] + level_codes = self.codes[num] sought_labels = levs.isin(values).nonzero()[0] if levs.size == 0: - return np.zeros(len(labs), dtype=np.bool_) + return np.zeros(len(level_codes), dtype=np.bool_) else: - return np.lib.arraysetops.in1d(labs, sought_labels) + return np.lib.arraysetops.in1d(level_codes, sought_labels) MultiIndex._add_numeric_methods_disabled() diff --git a/pandas/core/series.py b/pandas/core/series.py index 47edd29f2bbf0..c9ef2bc9f8a3c 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2829,7 +2829,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, elif isinstance(index, MultiIndex): from pandas.core.sorting import lexsort_indexer labels = index._sort_levels_monotonic() - indexer = lexsort_indexer(labels._get_labels_for_sorting(), + indexer = lexsort_indexer(labels._get_codes_for_sorting(), orders=ascending, na_position=na_position) else: From 6ae2efdedb084c610dbe740978d4b018bc2e9ac2 Mon Sep 17 00:00:00 2001 From: tp Date: Wed, 21 Nov 2018 23:37:07 +0000 Subject: [PATCH 07/12] Change docs according to #23752 --- doc/source/advanced.rst | 7 ++++++- doc/source/api.rst | 4 ++-- doc/source/dsintro.rst | 2 +- doc/source/indexing.rst | 6 +++--- doc/source/internals.rst | 10 +++++----- doc/source/io.rst | 4 ++-- 6 files changed, 19 insertions(+), 14 deletions(-) diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst index 24a1ac7be7d1d..e16b2652ab7d4 100644 --- a/doc/source/advanced.rst +++ b/doc/source/advanced.rst @@ -49,6 +49,11 @@ analysis. See the :ref:`cookbook` for some advanced strategies. +.. versionchanged:: 0.24.0 + + :attr:`MultiIndex.labels` has been renamed to :attr:`MultiIndex.codes` + and :attr:`MultiIndex.set_labels` to :attr:`MultiIndex.set_codes`. + Creating a MultiIndex (hierarchical index) object ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -469,7 +474,7 @@ values across a level. For instance: .. 
ipython:: python midx = pd.MultiIndex(levels=[['zero', 'one'], ['x', 'y']], - labels=[[1, 1, 0, 0], [1, 0, 1, 0]]) + codes=[[1, 1, 0, 0],[1, 0, 1, 0]]) df = pd.DataFrame(np.random.randn(4, 2), index=midx) df df2 = df.mean(level=0) diff --git a/doc/source/api.rst b/doc/source/api.rst index 82ae58acc4974..1a23587d2ebb5 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -1712,7 +1712,7 @@ MultiIndex Attributes MultiIndex.names MultiIndex.levels - MultiIndex.labels + MultiIndex.codes MultiIndex.nlevels MultiIndex.levshape @@ -1723,7 +1723,7 @@ MultiIndex Components :toctree: generated/ MultiIndex.set_levels - MultiIndex.set_labels + MultiIndex.set_codes MultiIndex.to_hierarchical MultiIndex.to_flat_index MultiIndex.to_frame diff --git a/doc/source/dsintro.rst b/doc/source/dsintro.rst index 6195212873e75..968b30d7e9e2b 100644 --- a/doc/source/dsintro.rst +++ b/doc/source/dsintro.rst @@ -961,7 +961,7 @@ From DataFrame using ``to_panel`` method .. ipython:: python :okwarning: - midx = pd.MultiIndex(levels=[['one', 'two'], ['x','y']], labels=[[1,1,0,0],[1,0,1,0]]) + midx = pd.MultiIndex(levels=[['one', 'two'], ['x','y']], codes=[[1,1,0,0],[1,0,1,0]]) df = pd.DataFrame({'A' : [1, 2, 3, 4], 'B': [5, 6, 7, 8]}, index=midx) df.to_panel() diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index dc0c6dd027b3c..6ad9c573249a3 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -1571,9 +1571,9 @@ Setting metadata Indexes are "mostly immutable", but it is possible to set and change their metadata, like the index ``name`` (or, for ``MultiIndex``, ``levels`` and -``labels``). +``codes``). -You can use the ``rename``, ``set_names``, ``set_levels``, and ``set_labels`` +You can use the ``rename``, ``set_names``, ``set_levels``, and ``set_codes`` to set these attributes directly. They default to returning a copy; however, you can specify ``inplace=True`` to have the data change in place. @@ -1588,7 +1588,7 @@ See :ref:`Advanced Indexing ` for usage of MultiIndexes. ind.name = "bob" ind -``set_names``, ``set_levels``, and ``set_labels`` also take an optional +``set_names``, ``set_levels``, and ``set_codes`` also take an optional `level`` argument .. ipython:: python diff --git a/doc/source/internals.rst b/doc/source/internals.rst index fdf18aa47416b..c39dafa88db92 100644 --- a/doc/source/internals.rst +++ b/doc/source/internals.rst @@ -74,7 +74,7 @@ MultiIndex ~~~~~~~~~~ Internally, the ``MultiIndex`` consists of a few things: the **levels**, the -integer **labels**, and the level **names**: +integer **codes** (until version 0.24 named *labels*), and the level **names**: .. ipython:: python @@ -82,15 +82,15 @@ integer **labels**, and the level **names**: names=['first', 'second']) index index.levels - index.labels + index.codes index.names -You can probably guess that the labels determine which unique element is +You can probably guess that the codes determine which unique element is identified with that location at each layer of the index. It's important to -note that sortedness is determined **solely** from the integer labels and does +note that sortedness is determined **solely** from the integer codes and does not check (or care) whether the levels themselves are sorted. Fortunately, the constructors ``from_tuples`` and ``from_arrays`` ensure that this is true, but -if you compute the levels and labels yourself, please be careful. +if you compute the levels and codes yourself, please be careful. 
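A minimal sketch of that relationship, assuming only the ``levels``/``codes``
decomposition described above (the level values are invented for
illustration)::

    import pandas as pd

    mi = pd.MultiIndex(levels=[['a', 'b'], ['one', 'two']],
                       codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
                       names=['first', 'second'])

    # each row is recovered by looking every integer code up in its level;
    # a code of -1 would stand for a missing value (NaN) at that position
    rebuilt = [tuple(level[code] for level, code in zip(mi.levels, row))
               for row in zip(*mi.codes)]
    assert rebuilt == list(mi)   # [('a', 'one'), ('a', 'two'), ('b', 'one'), ('b', 'two')]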
Values ~~~~~~ diff --git a/doc/source/io.rst b/doc/source/io.rst index fbd238586c776..313c4d723d079 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -3728,8 +3728,8 @@ storing/selecting from homogeneous index ``DataFrames``. index = pd.MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', 'three']], - labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=['foo', 'bar']) df_mi = pd.DataFrame(np.random.randn(10, 3), index=index, columns=['A', 'B', 'C']) From 34e7ec55d82b8d78a9e7803ddc8f355eb3fa7244 Mon Sep 17 00:00:00 2001 From: tp Date: Thu, 22 Nov 2018 18:39:06 +0000 Subject: [PATCH 08/12] Update labels -> codes in various locations --- asv_bench/benchmarks/multiindex_object.py | 4 +-- asv_bench/benchmarks/stat_ops.py | 16 ++++++------ doc/source/whatsnew/v0.24.0.rst | 4 +-- pandas/core/indexes/multi.py | 12 ++++----- pandas/tests/indexing/test_multiindex.py | 30 +++++++++++------------ pandas/tests/io/parser/test_header.py | 8 +++--- pandas/tests/io/parser/test_index_col.py | 2 +- 7 files changed, 37 insertions(+), 39 deletions(-) diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py index ff202322dbe84..adc6730dcd946 100644 --- a/asv_bench/benchmarks/multiindex_object.py +++ b/asv_bench/benchmarks/multiindex_object.py @@ -79,8 +79,8 @@ def setup(self): levels = [np.arange(n), tm.makeStringIndex(n).values, 1000 + np.arange(n)] - labels = [np.random.choice(n, (k * n)) for lev in levels] - self.mi = MultiIndex(levels=levels, labels=labels) + codes = [np.random.choice(n, (k * n)) for lev in levels] + self.mi = MultiIndex(levels=levels, codes=codes) def time_duplicated(self): self.mi.duplicated() diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py index 66ded52ca35b2..500e4d74d4c4f 100644 --- a/asv_bench/benchmarks/stat_ops.py +++ b/asv_bench/benchmarks/stat_ops.py @@ -31,10 +31,10 @@ class FrameMultiIndexOps(object): def setup(self, level, op): levels = [np.arange(10), np.arange(100), np.arange(100)] - labels = [np.arange(10).repeat(10000), - np.tile(np.arange(100).repeat(100), 10), - np.tile(np.tile(np.arange(100), 100), 10)] - index = pd.MultiIndex(levels=levels, labels=labels) + codes = [np.arange(10).repeat(10000), + np.tile(np.arange(100).repeat(100), 10), + np.tile(np.tile(np.arange(100), 100), 10)] + index = pd.MultiIndex(levels=levels, codes=codes) df = pd.DataFrame(np.random.randn(len(index), 4), index=index) self.df_func = getattr(df, op) @@ -67,10 +67,10 @@ class SeriesMultiIndexOps(object): def setup(self, level, op): levels = [np.arange(10), np.arange(100), np.arange(100)] - labels = [np.arange(10).repeat(10000), - np.tile(np.arange(100).repeat(100), 10), - np.tile(np.tile(np.arange(100), 100), 10)] - index = pd.MultiIndex(levels=levels, labels=labels) + codes = [np.arange(10).repeat(10000), + np.tile(np.arange(100).repeat(100), 10), + np.tile(np.tile(np.arange(100), 100), 10)] + index = pd.MultiIndex(levels=levels, codes=codes) s = pd.Series(np.random.randn(len(index)), index=index) self.s_func = getattr(s, op) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 4e800a6f2c257..67e65c185b33e 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1102,9 +1102,9 @@ Deprecations - :attr:`MultiIndex.labels` has been deprecated and replaced by :attr:`MultiIndex.codes`. The functionality is unchanged. 
This new name better reflects the natures of - these codes and makes the API more similar to the API for + these codes and makes the ``MultiIndex`` API more similar to the API for :class:`CategoricalIndex`(:issue:`13443`). - As a consequence, other uses of the name ``labels`` have also been deprecated in ``MultiIndex`` and replaced with ``codes``: + As a consequence, other uses of the name ``labels`` in ``MultiIndex`` have also been deprecated and replaced with ``codes``: - You should initialize a ``MultiIndex`` instance using a parameter named ``codes`` rather than ``labels``. - :meth:`MultiIndex.set_labels` has been deprecated in favor of :meth:`MultiIndex.set_codes` - for method :meth:`MultiIndex.copy`, the ``labels`` parameter has been deprecated and replaced by a ``codes`` parameter. diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index f1587b387332e..97ed69a427cd2 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -11,7 +11,7 @@ from pandas.compat import lrange, lzip, map, range, zip from pandas.compat.numpy import function as nv from pandas.errors import PerformanceWarning, UnsortedIndexError -from pandas.util._decorators import Appender, cache_readonly +from pandas.util._decorators import Appender, cache_readonly, deprecate_kwarg from pandas.core.dtypes.common import ( ensure_int64, ensure_platform_int, is_categorical_dtype, is_hashable, @@ -31,8 +31,6 @@ from pandas.io.formats.printing import pprint_thing -from pandas.util._decorators import deprecate_kwarg - _index_doc_kwargs = dict(ibase._index_doc_kwargs) _index_doc_kwargs.update( dict(klass='MultiIndex', @@ -655,19 +653,19 @@ def set_codes(self, codes, level=None, inplace=False, >>> idx = pd.MultiIndex.from_tuples([(1, u'one'), (1, u'two'), (2, u'one'), (2, u'two')], names=['foo', 'bar']) - >>> idx.set_labels([[1,0,1,0], [0,0,1,1]]) + >>> idx.set_codes([[1,0,1,0], [0,0,1,1]]) MultiIndex(levels=[[1, 2], [u'one', u'two']], labels=[[1, 0, 1, 0], [0, 0, 1, 1]], names=[u'foo', u'bar']) - >>> idx.set_labels([1,0,1,0], level=0) + >>> idx.set_codes([1,0,1,0], level=0) MultiIndex(levels=[[1, 2], [u'one', u'two']], labels=[[1, 0, 1, 0], [0, 1, 0, 1]], names=[u'foo', u'bar']) - >>> idx.set_labels([0,0,1,1], level='bar') + >>> idx.set_codes([0,0,1,1], level='bar') MultiIndex(levels=[[1, 2], [u'one', u'two']], labels=[[0, 0, 1, 1], [0, 0, 1, 1]], names=[u'foo', u'bar']) - >>> idx.set_labels([[1,0,1,0], [0,0,1,1]], level=[0,1]) + >>> idx.set_codes([[1,0,1,0], [0,0,1,1]], level=[0,1]) MultiIndex(levels=[[1, 2], [u'one', u'two']], labels=[[1, 0, 1, 0], [0, 0, 1, 1]], names=[u'foo', u'bar']) diff --git a/pandas/tests/indexing/test_multiindex.py b/pandas/tests/indexing/test_multiindex.py index d9cb9f55b7c8e..8dffd08024ebc 100644 --- a/pandas/tests/indexing/test_multiindex.py +++ b/pandas/tests/indexing/test_multiindex.py @@ -24,8 +24,8 @@ def multiindex_dataframe_random_data(): """DataFrame with 2 level MultiIndex with random data""" index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', 'three']], - labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=['first', 'second']) return DataFrame(np.random.randn(10, 3), index=index, columns=Index(['A', 'B', 'C'], name='exp')) @@ -35,7 +35,7 @@ def multiindex_dataframe_random_data(): def single_level_multiindex(): """single level MultiIndex""" return MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']], - labels=[[0, 1, 2, 3]], 
names=['first']) + codes=[[0, 1, 2, 3]], names=['first']) @pytest.fixture @@ -900,8 +900,8 @@ def test_frame_getitem_setitem_slice( def test_frame_getitem_setitem_multislice(self): levels = [['t1', 't2'], ['a', 'b', 'c']] - labels = [[0, 0, 0, 1, 1], [0, 1, 2, 0, 1]] - midx = MultiIndex(labels=labels, levels=levels, names=[None, 'id']) + codes = [[0, 0, 0, 1, 1], [0, 1, 2, 0, 1]] + midx = MultiIndex(codes=codes, levels=levels, names=[None, 'id']) df = DataFrame({'value': [1, 2, 3, 7, 8]}, index=midx) result = df.loc[:, 'value'] @@ -1044,9 +1044,9 @@ def test_xs_partial(self, multiindex_dataframe_random_data, # ex from #1796 index = MultiIndex(levels=[['foo', 'bar'], ['one', 'two'], [-1, 1]], - labels=[[0, 0, 0, 0, 1, 1, 1, 1], - [0, 0, 1, 1, 0, 0, 1, 1], [0, 1, 0, 1, 0, 1, - 0, 1]]) + codes=[[0, 0, 0, 0, 1, 1, 1, 1], + [0, 0, 1, 1, 0, 0, 1, 1], [0, 1, 0, 1, 0, 1, + 0, 1]]) df = DataFrame(np.random.randn(8, 4), index=index, columns=list('abcd')) @@ -1189,7 +1189,7 @@ def test_getitem_toplevel(self, multiindex_dataframe_random_data): def test_getitem_setitem_slice_integers(self): index = MultiIndex(levels=[[0, 1, 2], [0, 2]], - labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]]) + codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]]) frame = DataFrame(np.random.randn(len(index), 4), index=index, columns=['a', 'b', 'c', 'd']) @@ -1211,8 +1211,8 @@ def test_getitem_setitem_slice_integers(self): def test_getitem_int(self, multiindex_dataframe_random_data): levels = [[0, 1], [0, 1, 2]] - labels = [[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]] - index = MultiIndex(levels=levels, labels=labels) + codes = [[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]] + index = MultiIndex(levels=levels, codes=codes) frame = DataFrame(np.random.randn(6, 2), index=index) @@ -1236,7 +1236,7 @@ def test_getitem_partial( ymd = ymd.T result = ymd[2000, 2] - expected = ymd.reindex(columns=ymd.columns[ymd.columns.labels[1] == 1]) + expected = ymd.reindex(columns=ymd.columns[ymd.columns.codes[1] == 1]) expected.columns = expected.columns.droplevel(0).droplevel(0) tm.assert_frame_equal(result, expected) @@ -1279,12 +1279,12 @@ def test_fancy_slice_partial( ymd = multiindex_year_month_day_dataframe_random_data result = ymd.loc[(2000, 2):(2000, 4)] - lev = ymd.index.labels[1] + lev = ymd.index.codes[1] expected = ymd[(lev >= 1) & (lev <= 3)] tm.assert_frame_equal(result, expected) def test_getitem_partial_column_select(self): - idx = MultiIndex(labels=[[0, 0, 0], [0, 1, 1], [1, 0, 1]], + idx = MultiIndex(codes=[[0, 0, 0], [0, 1, 1], [1, 0, 1]], levels=[['a', 'b'], ['x', 'y'], ['p', 'q']]) df = DataFrame(np.random.rand(3, 2), index=idx) @@ -1582,7 +1582,7 @@ def test_frame_getitem_not_sorted2(self): df2_original = df2.copy() df2.index.set_levels(['b', 'd', 'a'], level='col1', inplace=True) - df2.index.set_labels([0, 1, 0, 2], level='col1', inplace=True) + df2.index.set_codes([0, 1, 0, 2], level='col1', inplace=True) assert not df2.index.is_lexsorted() assert not df2.index.is_monotonic diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index 47b13ae6c50b1..38f4cc42357fa 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -236,7 +236,7 @@ def test_header_multi_index_common_format_malformed1(all_parsers): columns=MultiIndex(levels=[[u("a"), u("b"), u("c")], [u("r"), u("s"), u("t"), u("u"), u("v")]], - labels=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]], + codes=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]], names=[u("a"), u("q")])) data = """a,a,a,b,c,c q,r,s,t,u,v @@ -255,7 +255,7 @@ 
def test_header_multi_index_common_format_malformed2(all_parsers): columns=MultiIndex(levels=[[u("a"), u("b"), u("c")], [u("r"), u("s"), u("t"), u("u"), u("v")]], - labels=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]], + codes=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]], names=[None, u("q")])) data = """,a,a,b,c,c @@ -272,10 +272,10 @@ def test_header_multi_index_common_format_malformed3(all_parsers): expected = DataFrame(np.array( [[3, 4, 5, 6], [9, 10, 11, 12]], dtype="int64"), index=MultiIndex(levels=[[1, 7], [2, 8]], - labels=[[0, 1], [0, 1]]), + codes=[[0, 1], [0, 1]]), columns=MultiIndex(levels=[[u("a"), u("b"), u("c")], [u("s"), u("t"), u("u"), u("v")]], - labels=[[0, 1, 2, 2], [0, 1, 2, 3]], + codes=[[0, 1, 2, 2], [0, 1, 2, 3]], names=[None, u("q")])) data = """,a,a,b,c,c q,r,s,t,u,v diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index 8c2de40b46114..6421afba18f94 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -148,5 +148,5 @@ def test_multi_index_naming_not_all_at_beginning(all_parsers): expected = DataFrame({"Unnamed: 2": ["c", "d", "c", "d"]}, index=MultiIndex( levels=[['a', 'b'], [1, 2, 3, 4]], - labels=[[0, 0, 1, 1], [0, 1, 2, 3]])) + codes=[[0, 0, 1, 1], [0, 1, 2, 3]])) tm.assert_frame_equal(result, expected) From dbd0bdc31aa1c0628f4590ddd4b02e4d03e47e37 Mon Sep 17 00:00:00 2001 From: tp Date: Tue, 27 Nov 2018 15:41:12 +0000 Subject: [PATCH 09/12] revert file additions --- pandas/tests/io/parser/header.py | 407 ---------------------------- pandas/tests/io/parser/index_col.py | 171 ------------ 2 files changed, 578 deletions(-) delete mode 100644 pandas/tests/io/parser/header.py delete mode 100644 pandas/tests/io/parser/index_col.py diff --git a/pandas/tests/io/parser/header.py b/pandas/tests/io/parser/header.py deleted file mode 100644 index 1801e48417591..0000000000000 --- a/pandas/tests/io/parser/header.py +++ /dev/null @@ -1,407 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Tests that the file header is properly handled or inferred -during parsing for all of the parsers defined in parsers.py -""" - -from collections import namedtuple - -import numpy as np -import pytest - -from pandas.compat import StringIO, lrange, u -from pandas.errors import ParserError - -from pandas import DataFrame, Index, MultiIndex -import pandas.util.testing as tm - - -class HeaderTests(object): - - def test_read_with_bad_header(self): - errmsg = r"but only \d+ lines in file" - - with pytest.raises(ValueError, match=errmsg): - s = StringIO(',,') - self.read_csv(s, header=[10]) - - def test_bool_header_arg(self): - # see gh-6114 - data = """\ -MyColumn - a - b - a - b""" - for arg in [True, False]: - with pytest.raises(TypeError): - self.read_csv(StringIO(data), header=arg) - with pytest.raises(TypeError): - self.read_table(StringIO(data), header=arg) - - def test_no_header_prefix(self): - data = """1,2,3,4,5 -6,7,8,9,10 -11,12,13,14,15 -""" - df_pref = self.read_table(StringIO(data), sep=',', prefix='Field', - header=None) - - expected = np.array([[1, 2, 3, 4, 5], - [6, 7, 8, 9, 10], - [11, 12, 13, 14, 15]], dtype=np.int64) - tm.assert_almost_equal(df_pref.values, expected) - - tm.assert_index_equal(df_pref.columns, - Index(['Field0', 'Field1', 'Field2', - 'Field3', 'Field4'])) - - def test_header_with_index_col(self): - data = """foo,1,2,3 -bar,4,5,6 -baz,7,8,9 -""" - names = ['A', 'B', 'C'] - df = self.read_csv(StringIO(data), names=names) - - assert list(df.columns) == ['A', 'B', 'C'] - - values = [[1, 2, 3], [4, 
5, 6], [7, 8, 9]] - expected = DataFrame(values, index=['foo', 'bar', 'baz'], - columns=['A', 'B', 'C']) - tm.assert_frame_equal(df, expected) - - def test_header_not_first_line(self): - data = """got,to,ignore,this,line -got,to,ignore,this,line -index,A,B,C,D -foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -""" - data2 = """index,A,B,C,D -foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -""" - - df = self.read_csv(StringIO(data), header=2, index_col=0) - expected = self.read_csv(StringIO(data2), header=0, index_col=0) - tm.assert_frame_equal(df, expected) - - def test_header_multi_index(self): - expected = tm.makeCustomDataframe( - 5, 3, r_idx_nlevels=2, c_idx_nlevels=4) - - data = """\ -C0,,C_l0_g0,C_l0_g1,C_l0_g2 - -C1,,C_l1_g0,C_l1_g1,C_l1_g2 -C2,,C_l2_g0,C_l2_g1,C_l2_g2 -C3,,C_l3_g0,C_l3_g1,C_l3_g2 -R0,R1,,, -R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2 -R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2 -R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2 -R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2 -R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2 -""" - - df = self.read_csv(StringIO(data), header=[0, 1, 2, 3], - index_col=[0, 1]) - tm.assert_frame_equal(df, expected) - - # skipping lines in the header - df = self.read_csv(StringIO(data), header=[0, 1, 2, 3], - index_col=[0, 1]) - tm.assert_frame_equal(df, expected) - - # INVALID OPTIONS - - # names - pytest.raises(ValueError, self.read_csv, - StringIO(data), header=[0, 1, 2, 3], - index_col=[0, 1], names=['foo', 'bar']) - - # usecols - pytest.raises(ValueError, self.read_csv, - StringIO(data), header=[0, 1, 2, 3], - index_col=[0, 1], usecols=['foo', 'bar']) - - # non-numeric index_col - pytest.raises(ValueError, self.read_csv, - StringIO(data), header=[0, 1, 2, 3], - index_col=['foo', 'bar']) - - def test_header_multiindex_common_format(self): - - df = DataFrame([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], - index=['one', 'two'], - columns=MultiIndex.from_tuples( - [('a', 'q'), ('a', 'r'), ('a', 's'), - ('b', 't'), ('c', 'u'), ('c', 'v')])) - - # to_csv - data = """,a,a,a,b,c,c -,q,r,s,t,u,v -,,,,,, -one,1,2,3,4,5,6 -two,7,8,9,10,11,12""" - - result = self.read_csv(StringIO(data), header=[0, 1], index_col=0) - tm.assert_frame_equal(df, result) - - # to_csv, tuples - result = self.read_csv(StringIO(data), skiprows=3, - names=[('a', 'q'), ('a', 'r'), ('a', 's'), - ('b', 't'), ('c', 'u'), ('c', 'v')], - index_col=0) - tm.assert_frame_equal(df, result) - - # to_csv, namedtuples - TestTuple = namedtuple('names', ['first', 'second']) - result = self.read_csv( - StringIO(data), skiprows=3, index_col=0, - names=[TestTuple('a', 'q'), TestTuple('a', 'r'), - TestTuple('a', 's'), TestTuple('b', 't'), - TestTuple('c', 'u'), TestTuple('c', 'v')]) - tm.assert_frame_equal(df, result) - - # common - data = """,a,a,a,b,c,c -,q,r,s,t,u,v -one,1,2,3,4,5,6 -two,7,8,9,10,11,12""" - - result = self.read_csv(StringIO(data), header=[0, 1], index_col=0) - tm.assert_frame_equal(df, result) - - # common, tuples - result = self.read_csv(StringIO(data), skiprows=2, - names=[('a', 'q'), ('a', 'r'), ('a', 's'), - ('b', 't'), ('c', 'u'), ('c', 'v')], - index_col=0) - tm.assert_frame_equal(df, result) - - # common, namedtuples - TestTuple = namedtuple('names', ['first', 'second']) - result = self.read_csv( - StringIO(data), skiprows=2, index_col=0, - names=[TestTuple('a', 'q'), TestTuple('a', 'r'), - TestTuple('a', 's'), TestTuple('b', 't'), - TestTuple('c', 'u'), TestTuple('c', 'v')]) - tm.assert_frame_equal(df, result) - - # common, no index_col - data = """a,a,a,b,c,c -q,r,s,t,u,v -1,2,3,4,5,6 -7,8,9,10,11,12""" - - result = self.read_csv(StringIO(data), 
header=[0, 1], index_col=None) - tm.assert_frame_equal(df.reset_index(drop=True), result) - - # common, no index_col, tuples - result = self.read_csv(StringIO(data), skiprows=2, - names=[('a', 'q'), ('a', 'r'), ('a', 's'), - ('b', 't'), ('c', 'u'), ('c', 'v')], - index_col=None) - tm.assert_frame_equal(df.reset_index(drop=True), result) - - # common, no index_col, namedtuples - TestTuple = namedtuple('names', ['first', 'second']) - result = self.read_csv( - StringIO(data), skiprows=2, index_col=None, - names=[TestTuple('a', 'q'), TestTuple('a', 'r'), - TestTuple('a', 's'), TestTuple('b', 't'), - TestTuple('c', 'u'), TestTuple('c', 'v')]) - tm.assert_frame_equal(df.reset_index(drop=True), result) - - # malformed case 1 - expected = DataFrame(np.array( - [[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype='int64'), - index=Index([1, 7]), - columns=MultiIndex(levels=[[u('a'), u('b'), u('c')], - [u('r'), u('s'), u('t'), - u('u'), u('v')]], - codes=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]], - names=[u('a'), u('q')])) - - data = """a,a,a,b,c,c -q,r,s,t,u,v -1,2,3,4,5,6 -7,8,9,10,11,12""" - - result = self.read_csv(StringIO(data), header=[0, 1], index_col=0) - tm.assert_frame_equal(expected, result) - - # malformed case 2 - expected = DataFrame(np.array( - [[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype='int64'), - index=Index([1, 7]), - columns=MultiIndex(levels=[[u('a'), u('b'), u('c')], - [u('r'), u('s'), u('t'), - u('u'), u('v')]], - codes=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]], - names=[None, u('q')])) - - data = """,a,a,b,c,c -q,r,s,t,u,v -1,2,3,4,5,6 -7,8,9,10,11,12""" - - result = self.read_csv(StringIO(data), header=[0, 1], index_col=0) - tm.assert_frame_equal(expected, result) - - # mi on columns and index (malformed) - expected = DataFrame(np.array( - [[3, 4, 5, 6], [9, 10, 11, 12]], dtype='int64'), - index=MultiIndex(levels=[[1, 7], [2, 8]], - codes=[[0, 1], [0, 1]]), - columns=MultiIndex(levels=[[u('a'), u('b'), u('c')], - [u('s'), u('t'), u('u'), u('v')]], - codes=[[0, 1, 2, 2], [0, 1, 2, 3]], - names=[None, u('q')])) - - data = """,a,a,b,c,c -q,r,s,t,u,v -1,2,3,4,5,6 -7,8,9,10,11,12""" - - result = self.read_csv(StringIO(data), header=[0, 1], index_col=[0, 1]) - tm.assert_frame_equal(expected, result) - - def test_header_names_backward_compat(self): - # #2539 - data = '1,2,3\n4,5,6' - - result = self.read_csv(StringIO(data), names=['a', 'b', 'c']) - expected = self.read_csv(StringIO(data), names=['a', 'b', 'c'], - header=None) - tm.assert_frame_equal(result, expected) - - data2 = 'foo,bar,baz\n' + data - result = self.read_csv(StringIO(data2), names=['a', 'b', 'c'], - header=0) - tm.assert_frame_equal(result, expected) - - def test_read_only_header_no_rows(self): - # See gh-7773 - expected = DataFrame(columns=['a', 'b', 'c']) - - df = self.read_csv(StringIO('a,b,c')) - tm.assert_frame_equal(df, expected) - - df = self.read_csv(StringIO('a,b,c'), index_col=False) - tm.assert_frame_equal(df, expected) - - def test_no_header(self): - data = """1,2,3,4,5 -6,7,8,9,10 -11,12,13,14,15 -""" - df = self.read_table(StringIO(data), sep=',', header=None) - df_pref = self.read_table(StringIO(data), sep=',', prefix='X', - header=None) - - names = ['foo', 'bar', 'baz', 'quux', 'panda'] - df2 = self.read_table(StringIO(data), sep=',', names=names) - expected = np.array([[1, 2, 3, 4, 5], - [6, 7, 8, 9, 10], - [11, 12, 13, 14, 15]], dtype=np.int64) - tm.assert_almost_equal(df.values, expected) - tm.assert_almost_equal(df.values, df2.values) - - tm.assert_index_equal(df_pref.columns, - Index(['X0', 'X1', 'X2', 'X3', 
'X4'])) - tm.assert_index_equal(df.columns, Index(lrange(5))) - - tm.assert_index_equal(df2.columns, Index(names)) - - def test_non_int_header(self): - # GH 16338 - msg = 'header must be integer or list of integers' - data = """1,2\n3,4""" - with pytest.raises(ValueError, match=msg): - self.read_csv(StringIO(data), sep=',', header=['a', 'b']) - with pytest.raises(ValueError, match=msg): - self.read_csv(StringIO(data), sep=',', header='string_header') - - def test_singleton_header(self): - # See GH #7757 - data = """a,b,c\n0,1,2\n1,2,3""" - df = self.read_csv(StringIO(data), header=[0]) - expected = DataFrame({"a": [0, 1], "b": [1, 2], "c": [2, 3]}) - tm.assert_frame_equal(df, expected) - - def test_mangles_multi_index(self): - # See GH 18062 - data = """A,A,A,B\none,one,one,two\n0,40,34,0.1""" - df = self.read_csv(StringIO(data), header=[0, 1]) - expected = DataFrame([[0, 40, 34, 0.1]], - columns=MultiIndex.from_tuples( - [('A', 'one'), ('A', 'one.1'), - ('A', 'one.2'), ('B', 'two')])) - tm.assert_frame_equal(df, expected) - - data = """A,A,A,B\none,one,one.1,two\n0,40,34,0.1""" - df = self.read_csv(StringIO(data), header=[0, 1]) - expected = DataFrame([[0, 40, 34, 0.1]], - columns=MultiIndex.from_tuples( - [('A', 'one'), ('A', 'one.1'), - ('A', 'one.1.1'), ('B', 'two')])) - tm.assert_frame_equal(df, expected) - - data = """A,A,A,B,B\none,one,one.1,two,two\n0,40,34,0.1,0.1""" - df = self.read_csv(StringIO(data), header=[0, 1]) - expected = DataFrame([[0, 40, 34, 0.1, 0.1]], - columns=MultiIndex.from_tuples( - [('A', 'one'), ('A', 'one.1'), - ('A', 'one.1.1'), ('B', 'two'), - ('B', 'two.1')])) - tm.assert_frame_equal(df, expected) - - @pytest.mark.parametrize("index_col", [None, [0]]) - @pytest.mark.parametrize("columns", [None, - (["", "Unnamed"]), - (["Unnamed", ""]), - (["Unnamed", "NotUnnamed"])]) - def test_multi_index_unnamed(self, index_col, columns): - # see gh-23687 - # - # When specifying a multi-index header, make sure that - # we don't error just because one of the rows in our header - # has ALL column names containing the string "Unnamed". The - # correct condition to check is whether the row contains - # ALL columns that did not have names (and instead were given - # placeholder ones). - header = [0, 1] - - if index_col is None: - data = ",".join(columns or ["", ""]) + "\n0,1\n2,3\n4,5\n" - else: - data = (",".join([""] + (columns or ["", ""])) + - "\n,0,1\n0,2,3\n1,4,5\n") - - if columns is None: - msg = (r"Passed header=\[0,1\] are too " - r"many rows for this multi_index of columns") - with pytest.raises(ParserError, match=msg): - self.read_csv(StringIO(data), header=header, - index_col=index_col) - else: - result = self.read_csv(StringIO(data), header=header, - index_col=index_col) - template = "Unnamed: {i}_level_0" - exp_columns = [] - - for i, col in enumerate(columns): - if not col: # Unnamed. 
- col = template.format(i=i if index_col is None else i + 1) - - exp_columns.append(col) - - columns = MultiIndex.from_tuples(zip(exp_columns, ["0", "1"])) - expected = DataFrame([[2, 3], [4, 5]], columns=columns) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/index_col.py b/pandas/tests/io/parser/index_col.py deleted file mode 100644 index 3be610b2ade22..0000000000000 --- a/pandas/tests/io/parser/index_col.py +++ /dev/null @@ -1,171 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Tests that the specified index column (a.k.a 'index_col') -is properly handled or inferred during parsing for all of -the parsers defined in parsers.py -""" - -import pytest - -from pandas.compat import StringIO - -from pandas import DataFrame, Index, MultiIndex -import pandas.util.testing as tm - - -class IndexColTests(object): - - def test_index_col_named(self): - no_header = """\ -KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" # noqa - - h = "ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir\n" # noqa - data = h + no_header - rs = self.read_csv(StringIO(data), index_col='ID') - xp = self.read_csv(StringIO(data), header=0).set_index('ID') - tm.assert_frame_equal(rs, xp) - - pytest.raises(ValueError, self.read_csv, StringIO(no_header), - index_col='ID') - - data = """\ -1,2,3,4,hello -5,6,7,8,world -9,10,11,12,foo -""" - names = ['a', 'b', 'c', 'd', 'message'] - xp = DataFrame({'a': [1, 5, 9], 'b': [2, 6, 10], 'c': [3, 7, 11], - 'd': [4, 8, 12]}, - index=Index(['hello', 'world', 'foo'], name='message')) - rs = self.read_csv(StringIO(data), names=names, index_col=['message']) - tm.assert_frame_equal(xp, rs) - assert xp.index.name == rs.index.name - - rs = self.read_csv(StringIO(data), names=names, index_col='message') - tm.assert_frame_equal(xp, rs) - assert xp.index.name == rs.index.name - - def test_index_col_is_true(self): - # see gh-9798 - pytest.raises(ValueError, self.read_csv, - StringIO(self.ts_data), index_col=True) - - def test_infer_index_col(self): - data = """A,B,C -foo,1,2,3 -bar,4,5,6 -baz,7,8,9 -""" - data = self.read_csv(StringIO(data)) - assert data.index.equals(Index(['foo', 'bar', 'baz'])) - - def test_empty_index_col_scenarios(self): - data = 'x,y,z' - - # None, no index - index_col, expected = None, DataFrame([], columns=list('xyz')), - tm.assert_frame_equal(self.read_csv( - StringIO(data), index_col=index_col), expected) - - # False, no index - index_col, expected = False, DataFrame([], columns=list('xyz')), - tm.assert_frame_equal(self.read_csv( - StringIO(data), index_col=index_col), expected) - - # int, first column - index_col, expected = 0, DataFrame( - [], columns=['y', 'z'], index=Index([], name='x')) - tm.assert_frame_equal(self.read_csv( - StringIO(data), index_col=index_col), expected) - - # int, not first column - index_col, expected = 1, DataFrame( - [], columns=['x', 'z'], index=Index([], name='y')) - tm.assert_frame_equal(self.read_csv( - StringIO(data), index_col=index_col), expected) - - # str, first column - index_col, expected = 'x', DataFrame( - [], columns=['y', 'z'], index=Index([], name='x')) - 
tm.assert_frame_equal(self.read_csv( - StringIO(data), index_col=index_col), expected) - - # str, not the first column - index_col, expected = 'y', DataFrame( - [], columns=['x', 'z'], index=Index([], name='y')) - tm.assert_frame_equal(self.read_csv( - StringIO(data), index_col=index_col), expected) - - # list of int - index_col, expected = [0, 1], DataFrame( - [], columns=['z'], index=MultiIndex.from_arrays( - [[]] * 2, names=['x', 'y'])) - tm.assert_frame_equal(self.read_csv( - StringIO(data), index_col=index_col), - expected, check_index_type=False) - - # list of str - index_col = ['x', 'y'] - expected = DataFrame([], columns=['z'], - index=MultiIndex.from_arrays( - [[]] * 2, names=['x', 'y'])) - tm.assert_frame_equal(self.read_csv(StringIO( - data), index_col=index_col), - expected, check_index_type=False) - - # list of int, reversed sequence - index_col = [1, 0] - expected = DataFrame([], columns=['z'], index=MultiIndex.from_arrays( - [[]] * 2, names=['y', 'x'])) - tm.assert_frame_equal(self.read_csv( - StringIO(data), index_col=index_col), - expected, check_index_type=False) - - # list of str, reversed sequence - index_col = ['y', 'x'] - expected = DataFrame([], columns=['z'], index=MultiIndex.from_arrays( - [[]] * 2, names=['y', 'x'])) - tm.assert_frame_equal(self.read_csv(StringIO( - data), index_col=index_col), - expected, check_index_type=False) - - def test_empty_with_index_col_false(self): - # see gh-10413 - data = 'x,y' - result = self.read_csv(StringIO(data), index_col=False) - expected = DataFrame([], columns=['x', 'y']) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("index_names", [ - ["", ""], - ["foo", ""], - ["", "bar"], - ["foo", "bar"], - ["NotReallyUnnamed", "Unnamed: 0"], - ]) - def test_multi_index_naming(self, index_names): - # We don't want empty index names being replaced with "Unnamed: 0" - data = ",".join(index_names + ["col\na,c,1\na,d,2\nb,c,3\nb,d,4"]) - result = self.read_csv(StringIO(data), index_col=[0, 1]) - - expected = DataFrame({"col": [1, 2, 3, 4]}, - index=MultiIndex.from_product([["a", "b"], - ["c", "d"]])) - expected.index.names = [name if name else None for name in index_names] - tm.assert_frame_equal(result, expected) - - def test_multi_index_naming_not_all_at_beginning(self): - data = ",Unnamed: 2,\na,c,1\na,d,2\nb,c,3\nb,d,4" - result = self.read_csv(StringIO(data), index_col=[0, 2]) - - expected = DataFrame({"Unnamed: 2": ["c", "d", "c", "d"]}, - index=MultiIndex( - levels=[['a', 'b'], [1, 2, 3, 4]], - codes=[[0, 0, 1, 1], [0, 1, 2, 3]])) - tm.assert_frame_equal(result, expected) From 4341ca1627799534e0d295fc222a358fd9ea93d1 Mon Sep 17 00:00:00 2001 From: tp Date: Thu, 29 Nov 2018 22:23:56 +0000 Subject: [PATCH 10/12] fixing rebasing issues --- pandas/core/indexes/base.py | 91 +- pandas/core/indexes/multi.py | 18 +- pandas/tests/indexes/multi/test_copy.py | 3 +- pandas/tests/indexing/multiindex/conftest.py | 4 +- .../tests/indexing/multiindex/test_getitem.py | 12 +- pandas/tests/indexing/multiindex/test_loc.py | 6 +- .../tests/indexing/multiindex/test_partial.py | 12 +- .../tests/indexing/multiindex/test_setitem.py | 6 +- .../tests/indexing/multiindex/test_sorted.py | 2 +- pandas/tests/indexing/test_multiindex.py | 2249 ----------------- pandas/tests/util/test_hashing.py | 2 +- 11 files changed, 56 insertions(+), 2349 deletions(-) delete mode 100644 pandas/tests/indexing/test_multiindex.py diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 4676ea632c949..88510e84a29a5 100644 --- 
a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1521,19 +1521,19 @@ def droplevel(self, level=0): # The two checks above guarantee that here self is a MultiIndex new_levels = list(self.levels) - new_labels = list(self.labels) + new_codes = list(self.codes) new_names = list(self.names) for i in levnums: new_levels.pop(i) - new_labels.pop(i) + new_codes.pop(i) new_names.pop(i) if len(new_levels) == 1: # set nan if needed - mask = new_labels[0] == -1 - result = new_levels[0].take(new_labels[0]) + mask = new_codes[0] == -1 + result = new_levels[0].take(new_codes[0]) if mask.any(): result = result.putmask(mask, np.nan) @@ -1541,7 +1541,7 @@ def droplevel(self, level=0): return result else: from .multi import MultiIndex - return MultiIndex(levels=new_levels, labels=new_labels, + return MultiIndex(levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False) _index_shared_docs['_get_grouper_for_level'] = """ @@ -3299,14 +3299,14 @@ def _join_multi(self, other, how, return_indexers=True): # common levels, ldrop_names, rdrop_names dropped_names = ldrop_names + rdrop_names - levels, labels, names = ( + levels, codes, names = ( _restore_dropped_levels_multijoin(self, other, dropped_names, join_idx, lidx, ridx)) # Re-create the multi-index - multi_join_idx = MultiIndex(levels=levels, labels=labels, + multi_join_idx = MultiIndex(levels=levels, codes=codes, names=names, verify_integrity=False) multi_join_idx = multi_join_idx.remove_unused_levels() @@ -3417,7 +3417,7 @@ def _get_leaf_sorter(labels): left_indexer = None join_index = left else: # sort the leaves - left_indexer = _get_leaf_sorter(left.labels[:level + 1]) + left_indexer = _get_leaf_sorter(left.codes[:level + 1]) join_index = left[left_indexer] else: @@ -3425,55 +3425,55 @@ def _get_leaf_sorter(labels): rev_indexer = lib.get_reverse_indexer(left_lev_indexer, len(old_level)) - new_lev_labels = algos.take_nd(rev_indexer, left.labels[level], - allow_fill=False) + new_lev_codes = algos.take_nd(rev_indexer, left.codes[level], + allow_fill=False) - new_labels = list(left.labels) - new_labels[level] = new_lev_labels + new_codes = list(left.codes) + new_codes[level] = new_lev_codes new_levels = list(left.levels) new_levels[level] = new_level if keep_order: # just drop missing values. o.w. keep order left_indexer = np.arange(len(left), dtype=np.intp) - mask = new_lev_labels != -1 + mask = new_lev_codes != -1 if not mask.all(): - new_labels = [lab[mask] for lab in new_labels] + new_codes = [lab[mask] for lab in new_codes] left_indexer = left_indexer[mask] else: # tie out the order with other if level == 0: # outer most level, take the fast route - ngroups = 1 + new_lev_labels.max() + ngroups = 1 + new_lev_codes.max() left_indexer, counts = libalgos.groupsort_indexer( - new_lev_labels, ngroups) + new_lev_codes, ngroups) # missing values are placed first; drop them! left_indexer = left_indexer[counts[0]:] - new_labels = [lab[left_indexer] for lab in new_labels] + new_codes = [lab[left_indexer] for lab in new_codes] else: # sort the leaves - mask = new_lev_labels != -1 + mask = new_lev_codes != -1 mask_all = mask.all() if not mask_all: - new_labels = [lab[mask] for lab in new_labels] + new_codes = [lab[mask] for lab in new_codes] - left_indexer = _get_leaf_sorter(new_labels[:level + 1]) - new_labels = [lab[left_indexer] for lab in new_labels] + left_indexer = _get_leaf_sorter(new_codes[:level + 1]) + new_codes = [lab[left_indexer] for lab in new_codes] # left_indexers are w.r.t masked frame. # reverse to original frame! 
if not mask_all: left_indexer = mask.nonzero()[0][left_indexer] - join_index = MultiIndex(levels=new_levels, labels=new_labels, + join_index = MultiIndex(levels=new_levels, codes=new_codes, names=left.names, verify_integrity=False) if right_lev_indexer is not None: right_indexer = algos.take_nd(right_lev_indexer, - join_index.labels[level], + join_index.codes[level], allow_fill=False) else: - right_indexer = join_index.labels[level] + right_indexer = join_index.codes[level] if flip_order: left_indexer, right_indexer = right_indexer, left_indexer @@ -4103,24 +4103,12 @@ def asof_locs(self, where, mask): return result -<<<<<<< HEAD def sort_values(self, return_indexer=False, ascending=True): """ Return a sorted copy of the index. Return a sorted copy of the index, and optionally return the indices that sorted the index itself. -======= - levels, codes, names = ( - _restore_dropped_levels_multijoin(self, other, - dropped_names, - join_idx, - lidx, ridx)) - - # Re-create the multi-index - multi_join_idx = MultiIndex(levels=levels, codes=codes, - names=names, verify_integrity=False) ->>>>>>> various changes Parameters ---------- @@ -4478,24 +4466,15 @@ def isin(self, values, level=None): passed set of values. The length of the returned boolean array matches the length of the index. -<<<<<<< HEAD Parameters ---------- values : set or list-like Sought values. .. versionadded:: 0.18.1 -======= - new_level_codes = algos.take_nd(rev_indexer, left.codes[level], - allow_fill=False) - - new_codes = list(left.codes) - new_codes[level] = new_level_codes ->>>>>>> various changes Support for values as a set. -<<<<<<< HEAD level : str or int, optional Name or position of the index level to use (if the index is a `MultiIndex`). @@ -4504,40 +4483,18 @@ def isin(self, values, level=None): ------- is_contained : ndarray NumPy array of boolean values. -======= - if keep_order: # just drop missing values. o.w. keep order - left_indexer = np.arange(len(left), dtype=np.intp) - mask = new_level_codes != -1 - if not mask.all(): - new_codes = [codes_[mask] for codes_ in new_codes] - left_indexer = left_indexer[mask] - - else: # tie out the order with other - if level == 0: # outer most level, take the fast route - ngroups = 1 + new_level_codes.max() - left_indexer, counts = libalgos.groupsort_indexer( - new_level_codes, ngroups) ->>>>>>> various changes See Also -------- Series.isin : Same for Series. DataFrame.isin : Same method for DataFrames. -<<<<<<< HEAD Notes ----- In the case of `MultiIndex` you must either specify `values` as a list-like object containing tuples that are the same length as the number of levels, or specify `level`. Otherwise it will raise a ``ValueError``. 
-======= - else: # sort the leaves - mask = new_level_codes != -1 - mask_all = mask.all() - if not mask_all: - new_codes = [lab[mask] for lab in new_codes] ->>>>>>> various changes If `level` is specified: diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 97ed69a427cd2..c779281cfcf16 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -327,11 +327,11 @@ def from_arrays(cls, arrays, sortorder=None, names=None): from pandas.core.arrays.categorical import _factorize_from_iterables - labels, levels = _factorize_from_iterables(arrays) + codes, levels = _factorize_from_iterables(arrays) if names is None: names = [getattr(arr, "name", None) for arr in arrays] - return MultiIndex(levels=levels, labels=labels, sortorder=sortorder, + return MultiIndex(levels=levels, codes=codes, sortorder=sortorder, names=names, verify_integrity=False) @classmethod @@ -427,9 +427,9 @@ def from_product(cls, iterables, sortorder=None, names=None): elif is_iterator(iterables): iterables = list(iterables) - labels, levels = _factorize_from_iterables(iterables) - labels = cartesian_product(labels) - return MultiIndex(levels, labels, sortorder=sortorder, names=names) + codes, levels = _factorize_from_iterables(iterables) + codes = cartesian_product(codes) + return MultiIndex(levels, codes, sortorder=sortorder, names=names) # -------------------------------------------------------------------- @@ -873,15 +873,15 @@ def format(self, space=2, sparsify=None, adjoin=True, names=False, return [] stringified_levels = [] - for lev, lab in zip(self.levels, self.labels): + for lev, level_codes in zip(self.levels, self.codes): na = na_rep if na_rep is not None else _get_na_rep(lev.dtype.type) if len(lev) > 0: - formatted = lev.take(lab).format(formatter=formatter) + formatted = lev.take(level_codes).format(formatter=formatter) # we have some NA - mask = lab == -1 + mask = level_codes == -1 if mask.any(): formatted = np.array(formatted, dtype=object) formatted[mask] = na @@ -891,7 +891,7 @@ def format(self, space=2, sparsify=None, adjoin=True, names=False, # weird all NA case formatted = [pprint_thing(na if isna(x) else x, escape_chars=('\t', '\r', '\n')) - for x in algos.take_1d(lev._values, lab)] + for x in algos.take_1d(lev._values, level_codes)] stringified_levels.append(formatted) result_levels = [] diff --git a/pandas/tests/indexes/multi/test_copy.py b/pandas/tests/indexes/multi/test_copy.py index 1379e0d85f860..aaf2fe1cb635f 100644 --- a/pandas/tests/indexes/multi/test_copy.py +++ b/pandas/tests/indexes/multi/test_copy.py @@ -39,9 +39,8 @@ def test_shallow_copy(idx): def test_labels_deprecated(idx): # GH23752 - codes = idx.codes with tm.assert_produces_warning(FutureWarning): - idx.copy(labels=codes) + idx.copy(labels=idx.codes) def test_view(idx): diff --git a/pandas/tests/indexing/multiindex/conftest.py b/pandas/tests/indexing/multiindex/conftest.py index f578fe7c0f60f..046fc19c0d9c8 100644 --- a/pandas/tests/indexing/multiindex/conftest.py +++ b/pandas/tests/indexing/multiindex/conftest.py @@ -10,8 +10,8 @@ def multiindex_dataframe_random_data(): """DataFrame with 2 level MultiIndex with random data""" index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', 'three']], - labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=['first', 'second']) return DataFrame(np.random.randn(10, 3), index=index, columns=Index(['A', 'B', 'C'], name='exp')) diff --git 
a/pandas/tests/indexing/multiindex/test_getitem.py b/pandas/tests/indexing/multiindex/test_getitem.py index efc1ebcbecee7..00b30bab37441 100644 --- a/pandas/tests/indexing/multiindex/test_getitem.py +++ b/pandas/tests/indexing/multiindex/test_getitem.py @@ -64,8 +64,8 @@ def test_getitem_duplicates_multiindex(self): index = MultiIndex(levels=[['D', 'B', 'C'], [0, 26, 27, 37, 57, 67, 75, 82]], - labels=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], - [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]], + codes=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], + [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]], names=['tag', 'day']) arr = np.random.randn(len(index), 1) df = DataFrame(arr, index=index, columns=['val']) @@ -87,8 +87,8 @@ def f(): # A is treated as a special Timestamp index = MultiIndex(levels=[['A', 'B', 'C'], [0, 26, 27, 37, 57, 67, 75, 82]], - labels=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], - [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]], + codes=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], + [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]], names=['tag', 'day']) df = DataFrame(arr, index=index, columns=['val']) result = df.val['A'] @@ -264,8 +264,8 @@ def test_getitem_toplevel(self, multiindex_dataframe_random_data): def test_getitem_int(self, multiindex_dataframe_random_data): levels = [[0, 1], [0, 1, 2]] - labels = [[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]] - index = MultiIndex(levels=levels, labels=labels) + codes = [[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]] + index = MultiIndex(levels=levels, codes=codes) frame = DataFrame(np.random.randn(6, 2), index=index) diff --git a/pandas/tests/indexing/multiindex/test_loc.py b/pandas/tests/indexing/multiindex/test_loc.py index f31685641753e..47a46bc05d0d9 100644 --- a/pandas/tests/indexing/multiindex/test_loc.py +++ b/pandas/tests/indexing/multiindex/test_loc.py @@ -11,7 +11,7 @@ def single_level_multiindex(): """single level MultiIndex""" return MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']], - labels=[[0, 1, 2, 3]], names=['first']) + codes=[[0, 1, 2, 3]], names=['first']) @pytest.mark.filterwarnings("ignore:\\n.ix:DeprecationWarning") @@ -40,7 +40,7 @@ def test_loc_getitem_series(self): empty = Series(data=[], dtype=np.float64) expected = Series([], index=MultiIndex( - levels=index.levels, labels=[[], []], dtype=np.float64)) + levels=index.levels, codes=[[], []], dtype=np.float64)) result = x.loc[empty] tm.assert_series_equal(result, expected) @@ -60,7 +60,7 @@ def test_loc_getitem_array(self): # empty array: empty = np.array([]) expected = Series([], index=MultiIndex( - levels=index.levels, labels=[[], []], dtype=np.float64)) + levels=index.levels, codes=[[], []], dtype=np.float64)) result = x.loc[empty] tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexing/multiindex/test_partial.py b/pandas/tests/indexing/multiindex/test_partial.py index dc2bd4d36e9fb..2e37ebe4a0629 100644 --- a/pandas/tests/indexing/multiindex/test_partial.py +++ b/pandas/tests/indexing/multiindex/test_partial.py @@ -52,9 +52,9 @@ def test_xs_partial(self, multiindex_dataframe_random_data, # ex from #1796 index = MultiIndex(levels=[['foo', 'bar'], ['one', 'two'], [-1, 1]], - labels=[[0, 0, 0, 0, 1, 1, 1, 1], - [0, 0, 1, 1, 0, 0, 1, 1], [0, 1, 0, 1, 0, 1, - 0, 1]]) + codes=[[0, 0, 0, 0, 1, 1, 1, 1], + [0, 0, 1, 1, 0, 0, 1, 1], [0, 1, 0, 1, 0, 1, + 0, 1]]) df = DataFrame(np.random.randn(8, 4), index=index, columns=list('abcd')) @@ -68,7 +68,7 @@ def test_getitem_partial( ymd = ymd.T result = ymd[2000, 2] - expected = ymd.reindex(columns=ymd.columns[ymd.columns.labels[1] == 1]) + expected = ymd.reindex(columns=ymd.columns[ymd.columns.codes[1] == 1]) 
expected.columns = expected.columns.droplevel(0).droplevel(0) tm.assert_frame_equal(result, expected) @@ -82,12 +82,12 @@ def test_fancy_slice_partial( ymd = multiindex_year_month_day_dataframe_random_data result = ymd.loc[(2000, 2):(2000, 4)] - lev = ymd.index.labels[1] + lev = ymd.index.codes[1] expected = ymd[(lev >= 1) & (lev <= 3)] tm.assert_frame_equal(result, expected) def test_getitem_partial_column_select(self): - idx = MultiIndex(labels=[[0, 0, 0], [0, 1, 1], [1, 0, 1]], + idx = MultiIndex(codes=[[0, 0, 0], [0, 1, 1], [1, 0, 1]], levels=[['a', 'b'], ['x', 'y'], ['p', 'q']]) df = DataFrame(np.random.rand(3, 2), index=idx) diff --git a/pandas/tests/indexing/multiindex/test_setitem.py b/pandas/tests/indexing/multiindex/test_setitem.py index 7288983f5f04b..bc00481ddfd90 100644 --- a/pandas/tests/indexing/multiindex/test_setitem.py +++ b/pandas/tests/indexing/multiindex/test_setitem.py @@ -277,8 +277,8 @@ def test_frame_getitem_setitem_boolean( def test_frame_getitem_setitem_multislice(self): levels = [['t1', 't2'], ['a', 'b', 'c']] - labels = [[0, 0, 0, 1, 1], [0, 1, 2, 0, 1]] - midx = MultiIndex(labels=labels, levels=levels, names=[None, 'id']) + codes = [[0, 0, 0, 1, 1], [0, 1, 2, 0, 1]] + midx = MultiIndex(codes=codes, levels=levels, names=[None, 'id']) df = DataFrame({'value': [1, 2, 3, 7, 8]}, index=midx) result = df.loc[:, 'value'] @@ -350,7 +350,7 @@ def test_getitem_setitem_tuple_plus_columns( def test_getitem_setitem_slice_integers(self): index = MultiIndex(levels=[[0, 1, 2], [0, 2]], - labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]]) + codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]]) frame = DataFrame(np.random.randn(len(index), 4), index=index, columns=['a', 'b', 'c', 'd']) diff --git a/pandas/tests/indexing/multiindex/test_sorted.py b/pandas/tests/indexing/multiindex/test_sorted.py index 898959d74383a..f565c30fc3e2c 100644 --- a/pandas/tests/indexing/multiindex/test_sorted.py +++ b/pandas/tests/indexing/multiindex/test_sorted.py @@ -39,7 +39,7 @@ def test_frame_getitem_not_sorted2(self): df2_original = df2.copy() df2.index.set_levels(['b', 'd', 'a'], level='col1', inplace=True) - df2.index.set_labels([0, 1, 0, 2], level='col1', inplace=True) + df2.index.set_codes([0, 1, 0, 2], level='col1', inplace=True) assert not df2.index.is_lexsorted() assert not df2.index.is_monotonic diff --git a/pandas/tests/indexing/test_multiindex.py b/pandas/tests/indexing/test_multiindex.py deleted file mode 100644 index 8dffd08024ebc..0000000000000 --- a/pandas/tests/indexing/test_multiindex.py +++ /dev/null @@ -1,2249 +0,0 @@ -from datetime import datetime -from warnings import catch_warnings, simplefilter - -import numpy as np -from numpy.random import randn -import pytest - -import pandas._libs.index as _index -from pandas.compat import ( - StringIO, lrange, lzip, product as cart_product, range, u, zip) -from pandas.errors import PerformanceWarning, UnsortedIndexError - -import pandas as pd -from pandas import ( - DataFrame, Index, MultiIndex, Panel, Period, Series, Timestamp, concat, - date_range, isna, notna, period_range, read_csv) -import pandas.core.common as com -from pandas.tests.indexing.common import _mklbl -from pandas.util import testing as tm - - -@pytest.fixture -def multiindex_dataframe_random_data(): - """DataFrame with 2 level MultiIndex with random data""" - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', - 'three']], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) - return 
DataFrame(np.random.randn(10, 3), index=index, - columns=Index(['A', 'B', 'C'], name='exp')) - - -@pytest.fixture -def single_level_multiindex(): - """single level MultiIndex""" - return MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']], - codes=[[0, 1, 2, 3]], names=['first']) - - -@pytest.fixture -def multiindex_year_month_day_dataframe_random_data(): - """DataFrame with 3 level MultiIndex (year, month, day) covering - first 100 business days from 2000-01-01 with random data""" - tm.N = 100 - tdf = tm.makeTimeDataFrame() - ymd = tdf.groupby([lambda x: x.year, lambda x: x.month, - lambda x: x.day]).sum() - # use Int64Index, to make sure things work - ymd.index.set_levels([lev.astype('i8') for lev in ymd.index.levels], - inplace=True) - ymd.index.set_names(['year', 'month', 'day'], inplace=True) - return ymd - - -@pytest.mark.filterwarnings("ignore:\\n.ix:DeprecationWarning") -class TestMultiIndexBasic(object): - - def test_iloc_getitem_multiindex2(self): - # TODO(wesm): fix this - pytest.skip('this test was being suppressed, ' - 'needs to be fixed') - - arr = np.random.randn(3, 3) - df = DataFrame(arr, columns=[[2, 2, 4], [6, 8, 10]], - index=[[4, 4, 8], [8, 10, 12]]) - - rs = df.iloc[2] - xp = Series(arr[2], index=df.columns) - tm.assert_series_equal(rs, xp) - - rs = df.iloc[:, 2] - xp = Series(arr[:, 2], index=df.index) - tm.assert_series_equal(rs, xp) - - rs = df.iloc[2, 2] - xp = df.values[2, 2] - assert rs == xp - - # for multiple items - # GH 5528 - rs = df.iloc[[0, 1]] - xp = df.xs(4, drop_level=False) - tm.assert_frame_equal(rs, xp) - - tup = zip(*[['a', 'a', 'b', 'b'], ['x', 'y', 'x', 'y']]) - index = MultiIndex.from_tuples(tup) - df = DataFrame(np.random.randn(4, 4), index=index) - rs = df.iloc[[2, 3]] - xp = df.xs('b', drop_level=False) - tm.assert_frame_equal(rs, xp) - - def test_setitem_multiindex(self): - with catch_warnings(record=True): - - for index_fn in ('ix', 'loc'): - - def assert_equal(a, b): - assert a == b - - def check(target, indexers, value, compare_fn, expected=None): - fn = getattr(target, index_fn) - fn.__setitem__(indexers, value) - result = fn.__getitem__(indexers) - if expected is None: - expected = value - compare_fn(result, expected) - # GH7190 - index = MultiIndex.from_product([np.arange(0, 100), - np.arange(0, 80)], - names=['time', 'firm']) - t, n = 0, 2 - df = DataFrame(np.nan, columns=['A', 'w', 'l', 'a', 'x', - 'X', 'd', 'profit'], - index=index) - check(target=df, indexers=((t, n), 'X'), value=0, - compare_fn=assert_equal) - - df = DataFrame(-999, columns=['A', 'w', 'l', 'a', 'x', - 'X', 'd', 'profit'], - index=index) - check(target=df, indexers=((t, n), 'X'), value=1, - compare_fn=assert_equal) - - df = DataFrame(columns=['A', 'w', 'l', 'a', 'x', - 'X', 'd', 'profit'], - index=index) - check(target=df, indexers=((t, n), 'X'), value=2, - compare_fn=assert_equal) - - # gh-7218: assigning with 0-dim arrays - df = DataFrame(-999, columns=['A', 'w', 'l', 'a', 'x', - 'X', 'd', 'profit'], - index=index) - check(target=df, - indexers=((t, n), 'X'), - value=np.array(3), - compare_fn=assert_equal, - expected=3, ) - - # GH5206 - df = DataFrame(np.arange(25).reshape(5, 5), - columns='A,B,C,D,E'.split(','), dtype=float) - df['F'] = 99 - row_selection = df['A'] % 2 == 0 - col_selection = ['B', 'C'] - with catch_warnings(record=True): - df.ix[row_selection, col_selection] = df['F'] - output = DataFrame(99., index=[0, 2, 4], columns=['B', 'C']) - with catch_warnings(record=True): - tm.assert_frame_equal(df.ix[row_selection, col_selection], - output) - 
check(target=df, - indexers=(row_selection, col_selection), - value=df['F'], - compare_fn=tm.assert_frame_equal, - expected=output, ) - - # GH11372 - idx = MultiIndex.from_product([ - ['A', 'B', 'C'], - date_range('2015-01-01', '2015-04-01', freq='MS')]) - cols = MultiIndex.from_product([ - ['foo', 'bar'], - date_range('2016-01-01', '2016-02-01', freq='MS')]) - - df = DataFrame(np.random.random((12, 4)), - index=idx, columns=cols) - - subidx = MultiIndex.from_tuples( - [('A', Timestamp('2015-01-01')), - ('A', Timestamp('2015-02-01'))]) - subcols = MultiIndex.from_tuples( - [('foo', Timestamp('2016-01-01')), - ('foo', Timestamp('2016-02-01'))]) - - vals = DataFrame(np.random.random((2, 2)), - index=subidx, columns=subcols) - check(target=df, - indexers=(subidx, subcols), - value=vals, - compare_fn=tm.assert_frame_equal, ) - # set all columns - vals = DataFrame( - np.random.random((2, 4)), index=subidx, columns=cols) - check(target=df, - indexers=(subidx, slice(None, None, None)), - value=vals, - compare_fn=tm.assert_frame_equal, ) - # identity - copy = df.copy() - check(target=df, indexers=(df.index, df.columns), value=df, - compare_fn=tm.assert_frame_equal, expected=copy) - - def test_loc_getitem_series(self): - # GH14730 - # passing a series as a key with a MultiIndex - index = MultiIndex.from_product([[1, 2, 3], ['A', 'B', 'C']]) - x = Series(index=index, data=range(9), dtype=np.float64) - y = Series([1, 3]) - expected = Series( - data=[0, 1, 2, 6, 7, 8], - index=MultiIndex.from_product([[1, 3], ['A', 'B', 'C']]), - dtype=np.float64) - result = x.loc[y] - tm.assert_series_equal(result, expected) - - result = x.loc[[1, 3]] - tm.assert_series_equal(result, expected) - - # GH15424 - y1 = Series([1, 3], index=[1, 2]) - result = x.loc[y1] - tm.assert_series_equal(result, expected) - - empty = Series(data=[], dtype=np.float64) - expected = Series([], index=MultiIndex( - levels=index.levels, codes=[[], []], dtype=np.float64)) - result = x.loc[empty] - tm.assert_series_equal(result, expected) - - def test_loc_getitem_array(self): - # GH15434 - # passing an array as a key with a MultiIndex - index = MultiIndex.from_product([[1, 2, 3], ['A', 'B', 'C']]) - x = Series(index=index, data=range(9), dtype=np.float64) - y = np.array([1, 3]) - expected = Series( - data=[0, 1, 2, 6, 7, 8], - index=MultiIndex.from_product([[1, 3], ['A', 'B', 'C']]), - dtype=np.float64) - result = x.loc[y] - tm.assert_series_equal(result, expected) - - # empty array: - empty = np.array([]) - expected = Series([], index=MultiIndex( - levels=index.levels, codes=[[], []], dtype=np.float64)) - result = x.loc[empty] - tm.assert_series_equal(result, expected) - - # 0-dim array (scalar): - scalar = np.int64(1) - expected = Series( - data=[0, 1, 2], - index=['A', 'B', 'C'], - dtype=np.float64) - result = x.loc[scalar] - tm.assert_series_equal(result, expected) - - def test_iloc_getitem_multiindex(self): - mi_codes = DataFrame(np.random.randn(4, 3), - columns=[['i', 'i', 'j'], ['A', 'A', 'B']], - index=[['i', 'i', 'j', 'k'], - ['X', 'X', 'Y', 'Y']]) - - mi_int = DataFrame(np.random.randn(3, 3), - columns=[[2, 2, 4], [6, 8, 10]], - index=[[4, 4, 8], [8, 10, 12]]) - - # the first row - rs = mi_int.iloc[0] - with catch_warnings(record=True): - xp = mi_int.ix[4].ix[8] - tm.assert_series_equal(rs, xp, check_names=False) - assert rs.name == (4, 8) - assert xp.name == 8 - - # 2nd (last) columns - rs = mi_int.iloc[:, 2] - with catch_warnings(record=True): - xp = mi_int.ix[:, 2] - tm.assert_series_equal(rs, xp) - - # corner column - rs = 
mi_int.iloc[2, 2] - with catch_warnings(record=True): - # First level is int - so use .loc rather than .ix (GH 21593) - xp = mi_int.loc[(8, 12), (4, 10)] - assert rs == xp - - # this is basically regular indexing - rs = mi_codes.iloc[2, 2] - with catch_warnings(record=True): - xp = mi_codes.ix['j'].ix[:, 'j'].ix[0, 0] - assert rs == xp - - def test_loc_multiindex(self): - - mi_codes = DataFrame(np.random.randn(3, 3), - columns=[['i', 'i', 'j'], ['A', 'A', 'B']], - index=[['i', 'i', 'j'], ['X', 'X', 'Y']]) - - mi_int = DataFrame(np.random.randn(3, 3), - columns=[[2, 2, 4], [6, 8, 10]], - index=[[4, 4, 8], [8, 10, 12]]) - - # the first row - rs = mi_codes.loc['i'] - with catch_warnings(record=True): - xp = mi_codes.ix['i'] - tm.assert_frame_equal(rs, xp) - - # 2nd (last) columns - rs = mi_codes.loc[:, 'j'] - with catch_warnings(record=True): - xp = mi_codes.ix[:, 'j'] - tm.assert_frame_equal(rs, xp) - - # corner column - rs = mi_codes.loc['j'].loc[:, 'j'] - with catch_warnings(record=True): - xp = mi_codes.ix['j'].ix[:, 'j'] - tm.assert_frame_equal(rs, xp) - - # with a tuple - rs = mi_codes.loc[('i', 'X')] - with catch_warnings(record=True): - xp = mi_codes.ix[('i', 'X')] - tm.assert_frame_equal(rs, xp) - - rs = mi_int.loc[4] - with catch_warnings(record=True): - xp = mi_int.ix[4] - tm.assert_frame_equal(rs, xp) - - # missing label - pytest.raises(KeyError, lambda: mi_int.loc[2]) - with catch_warnings(record=True): - # GH 21593 - pytest.raises(KeyError, lambda: mi_int.ix[2]) - - def test_getitem_partial_int(self): - # GH 12416 - # with single item - l1 = [10, 20] - l2 = ['a', 'b'] - df = DataFrame(index=range(2), - columns=MultiIndex.from_product([l1, l2])) - expected = DataFrame(index=range(2), - columns=l2) - result = df[20] - tm.assert_frame_equal(result, expected) - - # with list - expected = DataFrame(index=range(2), - columns=MultiIndex.from_product([l1[1:], l2])) - result = df[[20]] - tm.assert_frame_equal(result, expected) - - # missing item: - with pytest.raises(KeyError, match='1'): - df[1] - with pytest.raises(KeyError, match=r"'\[1\] not in index'"): - df[[1]] - - def test_loc_multiindex_indexer_none(self): - - # GH6788 - # multi-index indexer is None (meaning take all) - attributes = ['Attribute' + str(i) for i in range(1)] - attribute_values = ['Value' + str(i) for i in range(5)] - - index = MultiIndex.from_product([attributes, attribute_values]) - df = 0.1 * np.random.randn(10, 1 * 5) + 0.5 - df = DataFrame(df, columns=index) - result = df[attributes] - tm.assert_frame_equal(result, df) - - # GH 7349 - # loc with a multi-index seems to be doing fallback - df = DataFrame(np.arange(12).reshape(-1, 1), - index=MultiIndex.from_product([[1, 2, 3, 4], - [1, 2, 3]])) - - expected = df.loc[([1, 2], ), :] - result = df.loc[[1, 2]] - tm.assert_frame_equal(result, expected) - - def test_loc_multiindex_incomplete(self): - - # GH 7399 - # incomplete indexers - s = Series(np.arange(15, dtype='int64'), - MultiIndex.from_product([range(5), ['a', 'b', 'c']])) - expected = s.loc[:, 'a':'c'] - - result = s.loc[0:4, 'a':'c'] - tm.assert_series_equal(result, expected) - tm.assert_series_equal(result, expected) - - result = s.loc[:4, 'a':'c'] - tm.assert_series_equal(result, expected) - tm.assert_series_equal(result, expected) - - result = s.loc[0:, 'a':'c'] - tm.assert_series_equal(result, expected) - tm.assert_series_equal(result, expected) - - # GH 7400 - # multiindexer gettitem with list of indexers skips wrong element - s = Series(np.arange(15, dtype='int64'), - 
MultiIndex.from_product([range(5), ['a', 'b', 'c']])) - expected = s.iloc[[6, 7, 8, 12, 13, 14]] - result = s.loc[2:4:2, 'a':'c'] - tm.assert_series_equal(result, expected) - - def test_multiindex_perf_warn(self): - - df = DataFrame({'jim': [0, 0, 1, 1], - 'joe': ['x', 'x', 'z', 'y'], - 'jolie': np.random.rand(4)}).set_index(['jim', 'joe']) - - with tm.assert_produces_warning(PerformanceWarning, - clear=[pd.core.index]): - df.loc[(1, 'z')] - - df = df.iloc[[2, 1, 3, 0]] - with tm.assert_produces_warning(PerformanceWarning): - df.loc[(0, )] - - def test_series_getitem_multiindex(self): - - # GH 6018 - # series regression getitem with a multi-index - - s = Series([1, 2, 3]) - s.index = MultiIndex.from_tuples([(0, 0), (1, 1), (2, 1)]) - - result = s[:, 0] - expected = Series([1], index=[0]) - tm.assert_series_equal(result, expected) - - result = s.loc[:, 1] - expected = Series([2, 3], index=[1, 2]) - tm.assert_series_equal(result, expected) - - # xs - result = s.xs(0, level=0) - expected = Series([1], index=[0]) - tm.assert_series_equal(result, expected) - - result = s.xs(1, level=1) - expected = Series([2, 3], index=[1, 2]) - tm.assert_series_equal(result, expected) - - # GH6258 - dt = list(date_range('20130903', periods=3)) - idx = MultiIndex.from_product([list('AB'), dt]) - s = Series([1, 3, 4, 1, 3, 4], index=idx) - - result = s.xs('20130903', level=1) - expected = Series([1, 1], index=list('AB')) - tm.assert_series_equal(result, expected) - - # GH5684 - idx = MultiIndex.from_tuples([('a', 'one'), ('a', 'two'), ('b', 'one'), - ('b', 'two')]) - s = Series([1, 2, 3, 4], index=idx) - s.index.set_names(['L1', 'L2'], inplace=True) - result = s.xs('one', level='L2') - expected = Series([1, 3], index=['a', 'b']) - expected.index.set_names(['L1'], inplace=True) - tm.assert_series_equal(result, expected) - - def test_xs_multiindex(self): - - # GH2903 - columns = MultiIndex.from_tuples( - [('a', 'foo'), ('a', 'bar'), ('b', 'hello'), - ('b', 'world')], names=['lvl0', 'lvl1']) - df = DataFrame(np.random.randn(4, 4), columns=columns) - df.sort_index(axis=1, inplace=True) - result = df.xs('a', level='lvl0', axis=1) - expected = df.iloc[:, 0:2].loc[:, 'a'] - tm.assert_frame_equal(result, expected) - - result = df.xs('foo', level='lvl1', axis=1) - expected = df.iloc[:, 1:2].copy() - expected.columns = expected.columns.droplevel('lvl1') - tm.assert_frame_equal(result, expected) - - def test_multiindex_setitem(self): - - # GH 3738 - # setting with a multi-index right hand side - arrays = [np.array(['bar', 'bar', 'baz', 'qux', 'qux', 'bar']), - np.array(['one', 'two', 'one', 'one', 'two', 'one']), - np.arange(0, 6, 1)] - - df_orig = DataFrame(np.random.randn(6, 3), index=arrays, - columns=['A', 'B', 'C']).sort_index() - - expected = df_orig.loc[['bar']] * 2 - df = df_orig.copy() - df.loc[['bar']] *= 2 - tm.assert_frame_equal(df.loc[['bar']], expected) - - # raise because these have differing levels - def f(): - df.loc['bar'] *= 2 - - pytest.raises(TypeError, f) - - # from SO - # http://stackoverflow.com/questions/24572040/pandas-access-the-level-of-multiindex-for-inplace-operation - df_orig = DataFrame.from_dict({'price': { - ('DE', 'Coal', 'Stock'): 2, - ('DE', 'Gas', 'Stock'): 4, - ('DE', 'Elec', 'Demand'): 1, - ('FR', 'Gas', 'Stock'): 5, - ('FR', 'Solar', 'SupIm'): 0, - ('FR', 'Wind', 'SupIm'): 0 - }}) - df_orig.index = MultiIndex.from_tuples(df_orig.index, - names=['Sit', 'Com', 'Type']) - - expected = df_orig.copy() - expected.iloc[[0, 2, 3]] *= 2 - - idx = pd.IndexSlice - df = df_orig.copy() - 
df.loc[idx[:, :, 'Stock'], :] *= 2 - tm.assert_frame_equal(df, expected) - - df = df_orig.copy() - df.loc[idx[:, :, 'Stock'], 'price'] *= 2 - tm.assert_frame_equal(df, expected) - - def test_getitem_duplicates_multiindex(self): - # GH 5725 the 'A' happens to be a valid Timestamp so the doesn't raise - # the appropriate error, only in PY3 of course! - - index = MultiIndex(levels=[['D', 'B', 'C'], - [0, 26, 27, 37, 57, 67, 75, 82]], - codes=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], - [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]], - names=['tag', 'day']) - arr = np.random.randn(len(index), 1) - df = DataFrame(arr, index=index, columns=['val']) - result = df.val['D'] - expected = Series(arr.ravel()[0:3], name='val', index=Index( - [26, 37, 57], name='day')) - tm.assert_series_equal(result, expected) - - def f(): - df.val['A'] - - pytest.raises(KeyError, f) - - def f(): - df.val['X'] - - pytest.raises(KeyError, f) - - # A is treated as a special Timestamp - index = MultiIndex(levels=[['A', 'B', 'C'], - [0, 26, 27, 37, 57, 67, 75, 82]], - codes=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], - [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]], - names=['tag', 'day']) - df = DataFrame(arr, index=index, columns=['val']) - result = df.val['A'] - expected = Series(arr.ravel()[0:3], name='val', index=Index( - [26, 37, 57], name='day')) - tm.assert_series_equal(result, expected) - - def f(): - df.val['X'] - - pytest.raises(KeyError, f) - - # GH 7866 - # multi-index slicing with missing indexers - idx = MultiIndex.from_product([['A', 'B', 'C'], - ['foo', 'bar', 'baz']], - names=['one', 'two']) - s = Series(np.arange(9, dtype='int64'), index=idx).sort_index() - - exp_idx = MultiIndex.from_product([['A'], ['foo', 'bar', 'baz']], - names=['one', 'two']) - expected = Series(np.arange(3, dtype='int64'), - index=exp_idx).sort_index() - - result = s.loc[['A']] - tm.assert_series_equal(result, expected) - result = s.loc[['A', 'D']] - tm.assert_series_equal(result, expected) - - # not any values found - pytest.raises(KeyError, lambda: s.loc[['D']]) - - # empty ok - result = s.loc[[]] - expected = s.iloc[[]] - tm.assert_series_equal(result, expected) - - idx = pd.IndexSlice - expected = Series([0, 3, 6], index=MultiIndex.from_product( - [['A', 'B', 'C'], ['foo']], names=['one', 'two'])).sort_index() - - result = s.loc[idx[:, ['foo']]] - tm.assert_series_equal(result, expected) - result = s.loc[idx[:, ['foo', 'bah']]] - tm.assert_series_equal(result, expected) - - # GH 8737 - # empty indexer - multi_index = MultiIndex.from_product((['foo', 'bar', 'baz'], - ['alpha', 'beta'])) - df = DataFrame( - np.random.randn(5, 6), index=range(5), columns=multi_index) - df = df.sort_index(level=0, axis=1) - - expected = DataFrame(index=range(5), - columns=multi_index.reindex([])[0]) - result1 = df.loc[:, ([], slice(None))] - result2 = df.loc[:, (['foo'], [])] - tm.assert_frame_equal(result1, expected) - tm.assert_frame_equal(result2, expected) - - # regression from < 0.14.0 - # GH 7914 - df = DataFrame([[np.mean, np.median], ['mean', 'median']], - columns=MultiIndex.from_tuples([('functs', 'mean'), - ('functs', 'median')]), - index=['function', 'name']) - result = df.loc['function', ('functs', 'mean')] - assert result == np.mean - - def test_multiindex_assignment(self): - - # GH3777 part 2 - - # mixed dtype - df = DataFrame(np.random.randint(5, 10, size=9).reshape(3, 3), - columns=list('abc'), - index=[[4, 4, 8], [8, 10, 12]]) - df['d'] = np.nan - arr = np.array([0., 1.]) - - with catch_warnings(record=True): - df.ix[4, 'd'] = arr - tm.assert_series_equal(df.ix[4, 'd'], - 
Series(arr, index=[8, 10], name='d')) - - # single dtype - df = DataFrame(np.random.randint(5, 10, size=9).reshape(3, 3), - columns=list('abc'), - index=[[4, 4, 8], [8, 10, 12]]) - - with catch_warnings(record=True): - df.ix[4, 'c'] = arr - exp = Series(arr, index=[8, 10], name='c', dtype='float64') - tm.assert_series_equal(df.ix[4, 'c'], exp) - - # scalar ok - with catch_warnings(record=True): - df.ix[4, 'c'] = 10 - exp = Series(10, index=[8, 10], name='c', dtype='float64') - tm.assert_series_equal(df.ix[4, 'c'], exp) - - # invalid assignments - def f(): - with catch_warnings(record=True): - df.ix[4, 'c'] = [0, 1, 2, 3] - - pytest.raises(ValueError, f) - - def f(): - with catch_warnings(record=True): - df.ix[4, 'c'] = [0] - - pytest.raises(ValueError, f) - - # groupby example - NUM_ROWS = 100 - NUM_COLS = 10 - col_names = ['A' + num for num in - map(str, np.arange(NUM_COLS).tolist())] - index_cols = col_names[:5] - - df = DataFrame(np.random.randint(5, size=(NUM_ROWS, NUM_COLS)), - dtype=np.int64, columns=col_names) - df = df.set_index(index_cols).sort_index() - grp = df.groupby(level=index_cols[:4]) - df['new_col'] = np.nan - - f_index = np.arange(5) - - def f(name, df2): - return Series(np.arange(df2.shape[0]), - name=df2.index.values[0]).reindex(f_index) - - # TODO(wesm): unused? - # new_df = pd.concat([f(name, df2) for name, df2 in grp], axis=1).T - - # we are actually operating on a copy here - # but in this case, that's ok - for name, df2 in grp: - new_vals = np.arange(df2.shape[0]) - with catch_warnings(record=True): - df.ix[name, 'new_col'] = new_vals - - def test_multiindex_label_slicing_with_negative_step(self): - s = Series(np.arange(20), - MultiIndex.from_product([list('abcde'), np.arange(4)])) - SLC = pd.IndexSlice - - def assert_slices_equivalent(l_slc, i_slc): - tm.assert_series_equal(s.loc[l_slc], s.iloc[i_slc]) - tm.assert_series_equal(s[l_slc], s.iloc[i_slc]) - with catch_warnings(record=True): - tm.assert_series_equal(s.ix[l_slc], s.iloc[i_slc]) - - assert_slices_equivalent(SLC[::-1], SLC[::-1]) - - assert_slices_equivalent(SLC['d'::-1], SLC[15::-1]) - assert_slices_equivalent(SLC[('d', )::-1], SLC[15::-1]) - - assert_slices_equivalent(SLC[:'d':-1], SLC[:11:-1]) - assert_slices_equivalent(SLC[:('d', ):-1], SLC[:11:-1]) - - assert_slices_equivalent(SLC['d':'b':-1], SLC[15:3:-1]) - assert_slices_equivalent(SLC[('d', ):'b':-1], SLC[15:3:-1]) - assert_slices_equivalent(SLC['d':('b', ):-1], SLC[15:3:-1]) - assert_slices_equivalent(SLC[('d', ):('b', ):-1], SLC[15:3:-1]) - assert_slices_equivalent(SLC['b':'d':-1], SLC[:0]) - - assert_slices_equivalent(SLC[('c', 2)::-1], SLC[10::-1]) - assert_slices_equivalent(SLC[:('c', 2):-1], SLC[:9:-1]) - assert_slices_equivalent(SLC[('e', 0):('c', 2):-1], SLC[16:9:-1]) - - def test_multiindex_slice_first_level(self): - # GH 12697 - freq = ['a', 'b', 'c', 'd'] - idx = MultiIndex.from_product([freq, np.arange(500)]) - df = DataFrame(list(range(2000)), index=idx, columns=['Test']) - df_slice = df.loc[pd.IndexSlice[:, 30:70], :] - result = df_slice.loc['a'] - expected = DataFrame(list(range(30, 71)), - columns=['Test'], index=range(30, 71)) - tm.assert_frame_equal(result, expected) - result = df_slice.loc['d'] - expected = DataFrame(list(range(1530, 1571)), - columns=['Test'], index=range(30, 71)) - tm.assert_frame_equal(result, expected) - - def test_multiindex_symmetric_difference(self): - # GH 13490 - idx = MultiIndex.from_product([['a', 'b'], ['A', 'B']], - names=['a', 'b']) - result = idx ^ idx - assert result.names == idx.names - - idx2 
= idx.copy().rename(['A', 'B']) - result = idx ^ idx2 - assert result.names == [None, None] - - def test_multiindex_contains_dropped(self): - # GH 19027 - # test that dropped MultiIndex levels are not in the MultiIndex - # despite continuing to be in the MultiIndex's levels - idx = MultiIndex.from_product([[1, 2], [3, 4]]) - assert 2 in idx - idx = idx.drop(2) - - # drop implementation keeps 2 in the levels - assert 2 in idx.levels[0] - # but it should no longer be in the index itself - assert 2 not in idx - - # also applies to strings - idx = MultiIndex.from_product([['a', 'b'], ['c', 'd']]) - assert 'a' in idx - idx = idx.drop('a') - assert 'a' in idx.levels[0] - assert 'a' not in idx - - @pytest.mark.parametrize("data, expected", [ - (MultiIndex.from_product([(), ()]), True), - (MultiIndex.from_product([(1, 2), (3, 4)]), True), - (MultiIndex.from_product([('a', 'b'), (1, 2)]), False), - ]) - def test_multiindex_is_homogeneous_type(self, data, expected): - assert data._is_homogeneous_type is expected - - def test_getitem_simple(self, multiindex_dataframe_random_data): - frame = multiindex_dataframe_random_data - df = frame.T - - col = df['foo', 'one'] - tm.assert_almost_equal(col.values, df.values[:, 0]) - with pytest.raises(KeyError): - df[('foo', 'four')] - with pytest.raises(KeyError): - df['foobar'] - - def test_series_getitem( - self, multiindex_year_month_day_dataframe_random_data): - ymd = multiindex_year_month_day_dataframe_random_data - s = ymd['A'] - - result = s[2000, 3] - - # TODO(wesm): unused? - # result2 = s.loc[2000, 3] - - expected = s.reindex(s.index[42:65]) - expected.index = expected.index.droplevel(0).droplevel(0) - tm.assert_series_equal(result, expected) - - result = s[2000, 3, 10] - expected = s[49] - assert result == expected - - # fancy - expected = s.reindex(s.index[49:51]) - result = s.loc[[(2000, 3, 10), (2000, 3, 13)]] - tm.assert_series_equal(result, expected) - - with catch_warnings(record=True): - simplefilter("ignore", DeprecationWarning) - result = s.ix[[(2000, 3, 10), (2000, 3, 13)]] - tm.assert_series_equal(result, expected) - - # key error - pytest.raises(KeyError, s.__getitem__, (2000, 3, 4)) - - def test_series_getitem_corner( - self, multiindex_year_month_day_dataframe_random_data): - ymd = multiindex_year_month_day_dataframe_random_data - s = ymd['A'] - - # don't segfault, GH #495 - # out of bounds access - pytest.raises(IndexError, s.__getitem__, len(ymd)) - - # generator - result = s[(x > 0 for x in s)] - expected = s[s > 0] - tm.assert_series_equal(result, expected) - - def test_series_setitem( - self, multiindex_year_month_day_dataframe_random_data): - ymd = multiindex_year_month_day_dataframe_random_data - s = ymd['A'] - - s[2000, 3] = np.nan - assert isna(s.values[42:65]).all() - assert notna(s.values[:42]).all() - assert notna(s.values[65:]).all() - - s[2000, 3, 10] = np.nan - assert isna(s[49]) - - def test_series_slice_partial(self): - pass - - def test_frame_getitem_setitem_boolean( - self, multiindex_dataframe_random_data): - frame = multiindex_dataframe_random_data - df = frame.T.copy() - values = df.values - - result = df[df > 0] - expected = df.where(df > 0) - tm.assert_frame_equal(result, expected) - - df[df > 0] = 5 - values[values > 0] = 5 - tm.assert_almost_equal(df.values, values) - - df[df == 5] = 0 - values[values == 5] = 0 - tm.assert_almost_equal(df.values, values) - - # a df that needs alignment first - df[df[:-1] < 0] = 2 - np.putmask(values[:-1], values[:-1] < 0, 2) - tm.assert_almost_equal(df.values, values) - - with 
pytest.raises(TypeError, match='boolean values only'): - df[df * 0] = 2 - - def test_frame_getitem_setitem_slice( - self, multiindex_dataframe_random_data): - frame = multiindex_dataframe_random_data - # getitem - result = frame.iloc[:4] - expected = frame[:4] - tm.assert_frame_equal(result, expected) - - # setitem - cp = frame.copy() - cp.iloc[:4] = 0 - - assert (cp.values[:4] == 0).all() - assert (cp.values[4:] != 0).all() - - def test_frame_getitem_setitem_multislice(self): - levels = [['t1', 't2'], ['a', 'b', 'c']] - codes = [[0, 0, 0, 1, 1], [0, 1, 2, 0, 1]] - midx = MultiIndex(codes=codes, levels=levels, names=[None, 'id']) - df = DataFrame({'value': [1, 2, 3, 7, 8]}, index=midx) - - result = df.loc[:, 'value'] - tm.assert_series_equal(df['value'], result) - - with catch_warnings(record=True): - simplefilter("ignore", DeprecationWarning) - result = df.ix[:, 'value'] - tm.assert_series_equal(df['value'], result) - - result = df.loc[df.index[1:3], 'value'] - tm.assert_series_equal(df['value'][1:3], result) - - result = df.loc[:, :] - tm.assert_frame_equal(df, result) - - result = df - df.loc[:, 'value'] = 10 - result['value'] = 10 - tm.assert_frame_equal(df, result) - - df.loc[:, :] = 10 - tm.assert_frame_equal(df, result) - - def test_frame_getitem_multicolumn_empty_level(self): - f = DataFrame({'a': ['1', '2', '3'], 'b': ['2', '3', '4']}) - f.columns = [['level1 item1', 'level1 item2'], ['', 'level2 item2'], - ['level3 item1', 'level3 item2']] - - result = f['level1 item1'] - expected = DataFrame([['1'], ['2'], ['3']], index=f.index, - columns=['level3 item1']) - tm.assert_frame_equal(result, expected) - - def test_frame_setitem_multi_column(self): - df = DataFrame(randn(10, 4), columns=[['a', 'a', 'b', 'b'], - [0, 1, 0, 1]]) - - cp = df.copy() - cp['a'] = cp['b'] - tm.assert_frame_equal(cp['a'], cp['b']) - - # set with ndarray - cp = df.copy() - cp['a'] = cp['b'].values - tm.assert_frame_equal(cp['a'], cp['b']) - - # --------------------------------------- - # #1803 - columns = MultiIndex.from_tuples([('A', '1'), ('A', '2'), ('B', '1')]) - df = DataFrame(index=[1, 3, 5], columns=columns) - - # Works, but adds a column instead of updating the two existing ones - df['A'] = 0.0 # Doesn't work - assert (df['A'].values == 0).all() - - # it broadcasts - df['B', '1'] = [1, 2, 3] - df['A'] = df['B', '1'] - - sliced_a1 = df['A', '1'] - sliced_a2 = df['A', '2'] - sliced_b1 = df['B', '1'] - tm.assert_series_equal(sliced_a1, sliced_b1, check_names=False) - tm.assert_series_equal(sliced_a2, sliced_b1, check_names=False) - assert sliced_a1.name == ('A', '1') - assert sliced_a2.name == ('A', '2') - assert sliced_b1.name == ('B', '1') - - def test_getitem_tuple_plus_slice(self): - # GH #671 - df = DataFrame({'a': lrange(10), - 'b': lrange(10), - 'c': np.random.randn(10), - 'd': np.random.randn(10)}) - - idf = df.set_index(['a', 'b']) - - result = idf.loc[(0, 0), :] - expected = idf.loc[0, 0] - expected2 = idf.xs((0, 0)) - with catch_warnings(record=True): - simplefilter("ignore", DeprecationWarning) - expected3 = idf.ix[0, 0] - - tm.assert_series_equal(result, expected) - tm.assert_series_equal(result, expected2) - tm.assert_series_equal(result, expected3) - - def test_getitem_setitem_tuple_plus_columns( - self, multiindex_year_month_day_dataframe_random_data): - # GH #1013 - ymd = multiindex_year_month_day_dataframe_random_data - df = ymd[:5] - - result = df.loc[(2000, 1, 6), ['A', 'B', 'C']] - expected = df.loc[2000, 1, 6][['A', 'B', 'C']] - tm.assert_series_equal(result, expected) - - def 
test_xs(self, multiindex_dataframe_random_data): - frame = multiindex_dataframe_random_data - xs = frame.xs(('bar', 'two')) - xs2 = frame.loc[('bar', 'two')] - - tm.assert_series_equal(xs, xs2) - tm.assert_almost_equal(xs.values, frame.values[4]) - - # GH 6574 - # missing values in returned index should be preserrved - acc = [ - ('a', 'abcde', 1), - ('b', 'bbcde', 2), - ('y', 'yzcde', 25), - ('z', 'xbcde', 24), - ('z', None, 26), - ('z', 'zbcde', 25), - ('z', 'ybcde', 26), - ] - df = DataFrame(acc, - columns=['a1', 'a2', 'cnt']).set_index(['a1', 'a2']) - expected = DataFrame({'cnt': [24, 26, 25, 26]}, index=Index( - ['xbcde', np.nan, 'zbcde', 'ybcde'], name='a2')) - - result = df.xs('z', level='a1') - tm.assert_frame_equal(result, expected) - - def test_xs_partial(self, multiindex_dataframe_random_data, - multiindex_year_month_day_dataframe_random_data): - frame = multiindex_dataframe_random_data - ymd = multiindex_year_month_day_dataframe_random_data - result = frame.xs('foo') - result2 = frame.loc['foo'] - expected = frame.T['foo'].T - tm.assert_frame_equal(result, expected) - tm.assert_frame_equal(result, result2) - - result = ymd.xs((2000, 4)) - expected = ymd.loc[2000, 4] - tm.assert_frame_equal(result, expected) - - # ex from #1796 - index = MultiIndex(levels=[['foo', 'bar'], ['one', 'two'], [-1, 1]], - codes=[[0, 0, 0, 0, 1, 1, 1, 1], - [0, 0, 1, 1, 0, 0, 1, 1], [0, 1, 0, 1, 0, 1, - 0, 1]]) - df = DataFrame(np.random.randn(8, 4), index=index, - columns=list('abcd')) - - result = df.xs(['foo', 'one']) - expected = df.loc['foo', 'one'] - tm.assert_frame_equal(result, expected) - - def test_xs_with_duplicates(self, multiindex_dataframe_random_data): - # Issue #13719 - frame = multiindex_dataframe_random_data - df_dup = concat([frame] * 2) - assert df_dup.index.is_unique is False - expected = concat([frame.xs('one', level='second')] * 2) - tm.assert_frame_equal(df_dup.xs('one', level='second'), expected) - tm.assert_frame_equal(df_dup.xs(['one'], level=['second']), expected) - - def test_xs_level(self, multiindex_dataframe_random_data): - frame = multiindex_dataframe_random_data - result = frame.xs('two', level='second') - expected = frame[frame.index.get_level_values(1) == 'two'] - expected.index = expected.index.droplevel(1) - - tm.assert_frame_equal(result, expected) - - index = MultiIndex.from_tuples([('x', 'y', 'z'), ('a', 'b', 'c'), ( - 'p', 'q', 'r')]) - df = DataFrame(np.random.randn(3, 5), index=index) - result = df.xs('c', level=2) - expected = df[1:2] - expected.index = expected.index.droplevel(2) - tm.assert_frame_equal(result, expected) - - # this is a copy in 0.14 - result = frame.xs('two', level='second') - - # setting this will give a SettingWithCopyError - # as we are trying to write a view - def f(x): - x[:] = 10 - - pytest.raises(com.SettingWithCopyError, f, result) - - def test_xs_level_multiple(self): - text = """ A B C D E -one two three four -a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640 -a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744 -x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838""" - - df = read_csv(StringIO(text), sep=r'\s+', engine='python') - - result = df.xs(('a', 4), level=['one', 'four']) - expected = df.xs('a').xs(4, level='four') - tm.assert_frame_equal(result, expected) - - # this is a copy in 0.14 - result = df.xs(('a', 4), level=['one', 'four']) - - # setting this will give a SettingWithCopyError - # as we are trying to write a view - def f(x): - x[:] = 10 - - pytest.raises(com.SettingWithCopyError, f, result) - - # GH2107 - dates = 
lrange(20111201, 20111205) - ids = 'abcde' - idx = MultiIndex.from_tuples([x for x in cart_product(dates, ids)]) - idx.names = ['date', 'secid'] - df = DataFrame(np.random.randn(len(idx), 3), idx, ['X', 'Y', 'Z']) - - rs = df.xs(20111201, level='date') - xp = df.loc[20111201, :] - tm.assert_frame_equal(rs, xp) - - def test_xs_level0(self): - text = """ A B C D E -one two three four -a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640 -a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744 -x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838""" - - df = read_csv(StringIO(text), sep=r'\s+', engine='python') - - result = df.xs('a', level=0) - expected = df.xs('a') - assert len(result) == 2 - tm.assert_frame_equal(result, expected) - - def test_xs_level_series(self, multiindex_dataframe_random_data, - multiindex_year_month_day_dataframe_random_data): - frame = multiindex_dataframe_random_data - ymd = multiindex_year_month_day_dataframe_random_data - s = frame['A'] - result = s[:, 'two'] - expected = frame.xs('two', level=1)['A'] - tm.assert_series_equal(result, expected) - - s = ymd['A'] - result = s[2000, 5] - expected = ymd.loc[2000, 5]['A'] - tm.assert_series_equal(result, expected) - - # not implementing this for now - - pytest.raises(TypeError, s.__getitem__, (2000, slice(3, 4))) - - # result = s[2000, 3:4] - # lv =s.index.get_level_values(1) - # expected = s[(lv == 3) | (lv == 4)] - # expected.index = expected.index.droplevel(0) - # tm.assert_series_equal(result, expected) - - # can do this though - - def test_get_loc_single_level(self, single_level_multiindex): - single_level = single_level_multiindex - s = Series(np.random.randn(len(single_level)), - index=single_level) - for k in single_level.values: - s[k] - - def test_getitem_toplevel(self, multiindex_dataframe_random_data): - frame = multiindex_dataframe_random_data - df = frame.T - - result = df['foo'] - expected = df.reindex(columns=df.columns[:3]) - expected.columns = expected.columns.droplevel(0) - tm.assert_frame_equal(result, expected) - - result = df['bar'] - result2 = df.loc[:, 'bar'] - - expected = df.reindex(columns=df.columns[3:5]) - expected.columns = expected.columns.droplevel(0) - tm.assert_frame_equal(result, expected) - tm.assert_frame_equal(result, result2) - - def test_getitem_setitem_slice_integers(self): - index = MultiIndex(levels=[[0, 1, 2], [0, 2]], - codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]]) - - frame = DataFrame(np.random.randn(len(index), 4), index=index, - columns=['a', 'b', 'c', 'd']) - res = frame.loc[1:2] - exp = frame.reindex(frame.index[2:]) - tm.assert_frame_equal(res, exp) - - frame.loc[1:2] = 7 - assert (frame.loc[1:2] == 7).values.all() - - series = Series(np.random.randn(len(index)), index=index) - - res = series.loc[1:2] - exp = series.reindex(series.index[2:]) - tm.assert_series_equal(res, exp) - - series.loc[1:2] = 7 - assert (series.loc[1:2] == 7).values.all() - - def test_getitem_int(self, multiindex_dataframe_random_data): - levels = [[0, 1], [0, 1, 2]] - codes = [[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]] - index = MultiIndex(levels=levels, codes=codes) - - frame = DataFrame(np.random.randn(6, 2), index=index) - - result = frame.loc[1] - expected = frame[-3:] - expected.index = expected.index.droplevel(0) - tm.assert_frame_equal(result, expected) - - # raises exception - pytest.raises(KeyError, frame.loc.__getitem__, 3) - - # however this will work - frame = multiindex_dataframe_random_data - result = frame.iloc[2] - expected = frame.xs(frame.index[2]) - tm.assert_series_equal(result, expected) - - 
def test_getitem_partial( - self, multiindex_year_month_day_dataframe_random_data): - ymd = multiindex_year_month_day_dataframe_random_data - ymd = ymd.T - result = ymd[2000, 2] - - expected = ymd.reindex(columns=ymd.columns[ymd.columns.codes[1] == 1]) - expected.columns = expected.columns.droplevel(0).droplevel(0) - tm.assert_frame_equal(result, expected) - - def test_setitem_change_dtype(self, multiindex_dataframe_random_data): - frame = multiindex_dataframe_random_data - dft = frame.T - s = dft['foo', 'two'] - dft['foo', 'two'] = s > s.median() - tm.assert_series_equal(dft['foo', 'two'], s > s.median()) - # assert isinstance(dft._data.blocks[1].items, MultiIndex) - - reindexed = dft.reindex(columns=[('foo', 'two')]) - tm.assert_series_equal(reindexed['foo', 'two'], s > s.median()) - - def test_frame_setitem_ix(self, multiindex_dataframe_random_data): - frame = multiindex_dataframe_random_data - frame.loc[('bar', 'two'), 'B'] = 5 - assert frame.loc[('bar', 'two'), 'B'] == 5 - - # with integer labels - df = frame.copy() - df.columns = lrange(3) - df.loc[('bar', 'two'), 1] = 7 - assert df.loc[('bar', 'two'), 1] == 7 - - with catch_warnings(record=True): - simplefilter("ignore", DeprecationWarning) - df = frame.copy() - df.columns = lrange(3) - df.ix[('bar', 'two'), 1] = 7 - assert df.loc[('bar', 'two'), 1] == 7 - - def test_fancy_slice_partial( - self, multiindex_dataframe_random_data, - multiindex_year_month_day_dataframe_random_data): - frame = multiindex_dataframe_random_data - result = frame.loc['bar':'baz'] - expected = frame[3:7] - tm.assert_frame_equal(result, expected) - - ymd = multiindex_year_month_day_dataframe_random_data - result = ymd.loc[(2000, 2):(2000, 4)] - lev = ymd.index.codes[1] - expected = ymd[(lev >= 1) & (lev <= 3)] - tm.assert_frame_equal(result, expected) - - def test_getitem_partial_column_select(self): - idx = MultiIndex(codes=[[0, 0, 0], [0, 1, 1], [1, 0, 1]], - levels=[['a', 'b'], ['x', 'y'], ['p', 'q']]) - df = DataFrame(np.random.rand(3, 2), index=idx) - - result = df.loc[('a', 'y'), :] - expected = df.loc[('a', 'y')] - tm.assert_frame_equal(result, expected) - - result = df.loc[('a', 'y'), [1, 0]] - expected = df.loc[('a', 'y')][[1, 0]] - tm.assert_frame_equal(result, expected) - - with catch_warnings(record=True): - simplefilter("ignore", DeprecationWarning) - result = df.ix[('a', 'y'), [1, 0]] - tm.assert_frame_equal(result, expected) - - pytest.raises(KeyError, df.loc.__getitem__, - (('a', 'foo'), slice(None, None))) - - def test_frame_getitem_view(self, multiindex_dataframe_random_data): - frame = multiindex_dataframe_random_data - df = frame.T.copy() - - # this works because we are modifying the underlying array - # really a no-no - df['foo'].values[:] = 0 - assert (df['foo'].values == 0).all() - - # but not if it's mixed-type - df['foo', 'four'] = 'foo' - df = df.sort_index(level=0, axis=1) - - # this will work, but will raise/warn as its chained assignment - def f(): - df['foo']['one'] = 2 - return df - - pytest.raises(com.SettingWithCopyError, f) - - try: - df = f() - except ValueError: - pass - assert (df['foo', 'one'] == 0).all() - - def test_partial_set( - self, multiindex_year_month_day_dataframe_random_data): - # GH #397 - ymd = multiindex_year_month_day_dataframe_random_data - df = ymd.copy() - exp = ymd.copy() - df.loc[2000, 4] = 0 - exp.loc[2000, 4].values[:] = 0 - tm.assert_frame_equal(df, exp) - - df['A'].loc[2000, 4] = 1 - exp['A'].loc[2000, 4].values[:] = 1 - tm.assert_frame_equal(df, exp) - - df.loc[2000] = 5 - exp.loc[2000].values[:] 
= 5 - tm.assert_frame_equal(df, exp) - - # this works...for now - df['A'].iloc[14] = 5 - assert df['A'][14] == 5 - - def test_getitem_lowerdim_corner(self, multiindex_dataframe_random_data): - frame = multiindex_dataframe_random_data - pytest.raises(KeyError, frame.loc.__getitem__, - (('bar', 'three'), 'B')) - - # in theory should be inserting in a sorted space???? - frame.loc[('bar', 'three'), 'B'] = 0 - assert frame.sort_index().loc[('bar', 'three'), 'B'] == 0 - - # --------------------------------------------------------------------- - # AMBIGUOUS CASES! - - def test_partial_ix_missing( - self, multiindex_year_month_day_dataframe_random_data): - pytest.skip("skipping for now") - - ymd = multiindex_year_month_day_dataframe_random_data - result = ymd.loc[2000, 0] - expected = ymd.loc[2000]['A'] - tm.assert_series_equal(result, expected) - - # need to put in some work here - - # self.ymd.loc[2000, 0] = 0 - # assert (self.ymd.loc[2000]['A'] == 0).all() - - # Pretty sure the second (and maybe even the first) is already wrong. - pytest.raises(Exception, ymd.loc.__getitem__, (2000, 6)) - pytest.raises(Exception, ymd.loc.__getitem__, (2000, 6), 0) - - # --------------------------------------------------------------------- - - def test_int_series_slicing( - self, multiindex_year_month_day_dataframe_random_data): - ymd = multiindex_year_month_day_dataframe_random_data - s = ymd['A'] - result = s[5:] - expected = s.reindex(s.index[5:]) - tm.assert_series_equal(result, expected) - - exp = ymd['A'].copy() - s[5:] = 0 - exp.values[5:] = 0 - tm.assert_numpy_array_equal(s.values, exp.values) - - result = ymd[5:] - expected = ymd.reindex(s.index[5:]) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize('unicode_strings', [True, False]) - def test_mixed_depth_get(self, unicode_strings): - # If unicode_strings is True, the column labels in dataframe - # construction will use unicode strings in Python 2 (pull request - # #17099). 
- - arrays = [['a', 'top', 'top', 'routine1', 'routine1', 'routine2'], - ['', 'OD', 'OD', 'result1', 'result2', 'result1'], - ['', 'wx', 'wy', '', '', '']] - - if unicode_strings: - arrays = [[u(s) for s in arr] for arr in arrays] - - tuples = sorted(zip(*arrays)) - index = MultiIndex.from_tuples(tuples) - df = DataFrame(np.random.randn(4, 6), columns=index) - - result = df['a'] - expected = df['a', '', ''].rename('a') - tm.assert_series_equal(result, expected) - - result = df['routine1', 'result1'] - expected = df['routine1', 'result1', ''] - expected = expected.rename(('routine1', 'result1')) - tm.assert_series_equal(result, expected) - - def test_mixed_depth_insert(self): - arrays = [['a', 'top', 'top', 'routine1', 'routine1', 'routine2'], - ['', 'OD', 'OD', 'result1', 'result2', 'result1'], - ['', 'wx', 'wy', '', '', '']] - - tuples = sorted(zip(*arrays)) - index = MultiIndex.from_tuples(tuples) - df = DataFrame(randn(4, 6), columns=index) - - result = df.copy() - expected = df.copy() - result['b'] = [1, 2, 3, 4] - expected['b', '', ''] = [1, 2, 3, 4] - tm.assert_frame_equal(result, expected) - - def test_setitem_multiple_partial(self, multiindex_dataframe_random_data): - frame = multiindex_dataframe_random_data - expected = frame.copy() - result = frame.copy() - result.loc[['foo', 'bar']] = 0 - expected.loc['foo'] = 0 - expected.loc['bar'] = 0 - tm.assert_frame_equal(result, expected) - - expected = frame.copy() - result = frame.copy() - result.loc['foo':'bar'] = 0 - expected.loc['foo'] = 0 - expected.loc['bar'] = 0 - tm.assert_frame_equal(result, expected) - - expected = frame['A'].copy() - result = frame['A'].copy() - result.loc[['foo', 'bar']] = 0 - expected.loc['foo'] = 0 - expected.loc['bar'] = 0 - tm.assert_series_equal(result, expected) - - expected = frame['A'].copy() - result = frame['A'].copy() - result.loc['foo':'bar'] = 0 - expected.loc['foo'] = 0 - expected.loc['bar'] = 0 - tm.assert_series_equal(result, expected) - - def test_dataframe_insert_column_all_na(self): - # GH #1534 - mix = MultiIndex.from_tuples([('1a', '2a'), ('1a', '2b'), ('1a', '2c') - ]) - df = DataFrame([[1, 2], [3, 4], [5, 6]], index=mix) - s = Series({(1, 1): 1, (1, 2): 2}) - df['new'] = s - assert df['new'].isna().all() - - def test_set_column_scalar_with_ix(self, multiindex_dataframe_random_data): - frame = multiindex_dataframe_random_data - subset = frame.index[[1, 4, 5]] - - frame.loc[subset] = 99 - assert (frame.loc[subset].values == 99).all() - - col = frame['B'] - col[subset] = 97 - assert (frame.loc[subset, 'B'] == 97).all() - - def test_indexing_ambiguity_bug_1678(self): - columns = MultiIndex.from_tuples([('Ohio', 'Green'), ('Ohio', 'Red'), ( - 'Colorado', 'Green')]) - index = MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2) - ]) - - frame = DataFrame(np.arange(12).reshape((4, 3)), index=index, - columns=columns) - - result = frame.iloc[:, 1] - exp = frame.loc[:, ('Ohio', 'Red')] - assert isinstance(result, Series) - tm.assert_series_equal(result, exp) - - def test_nonunique_assignment_1750(self): - df = DataFrame([[1, 1, "x", "X"], [1, 1, "y", "Y"], [1, 2, "z", "Z"]], - columns=list("ABCD")) - - df = df.set_index(['A', 'B']) - ix = MultiIndex.from_tuples([(1, 1)]) - - df.loc[ix, "C"] = '_' - - assert (df.xs((1, 1))['C'] == '_').all() - - def test_indexing_over_hashtable_size_cutoff(self): - n = 10000 - - old_cutoff = _index._SIZE_CUTOFF - _index._SIZE_CUTOFF = 20000 - - s = Series(np.arange(n), - MultiIndex.from_arrays((["a"] * n, np.arange(n)))) - - # hai it works! 
- assert s[("a", 5)] == 5 - assert s[("a", 6)] == 6 - assert s[("a", 7)] == 7 - - _index._SIZE_CUTOFF = old_cutoff - - def test_iloc_mi(self): - # GH 13797 - # Test if iloc can handle integer locations in MultiIndexed DataFrame - - data = [['str00', 'str01'], ['str10', 'str11'], ['str20', 'srt21'], - ['str30', 'str31'], ['str40', 'str41']] - - mi = MultiIndex.from_tuples( - [('CC', 'A'), ('CC', 'B'), ('CC', 'B'), ('BB', 'a'), ('BB', 'b')]) - - expected = DataFrame(data) - df_mi = DataFrame(data, index=mi) - - result = DataFrame([[df_mi.iloc[r, c] for c in range(2)] - for r in range(5)]) - - tm.assert_frame_equal(result, expected) - - def test_getitem_multilevel_index_tuple_not_sorted(self): - index_columns = list("abc") - df = DataFrame([[0, 1, 0, "x"], [0, 0, 1, "y"]], - columns=index_columns + ["data"]) - df = df.set_index(index_columns) - query_index = df.index[:1] - rs = df.loc[query_index, "data"] - - xp_idx = MultiIndex.from_tuples([(0, 1, 0)], names=['a', 'b', 'c']) - xp = Series(['x'], index=xp_idx, name='data') - tm.assert_series_equal(rs, xp) - - def test_getitem_slice_not_sorted(self, multiindex_dataframe_random_data): - frame = multiindex_dataframe_random_data - df = frame.sort_index(level=1).T - - # buglet with int typechecking - result = df.iloc[:, :np.int32(3)] - expected = df.reindex(columns=df.columns[:3]) - tm.assert_frame_equal(result, expected) - - def test_frame_getitem_not_sorted2(self): - # 13431 - df = DataFrame({'col1': ['b', 'd', 'b', 'a'], - 'col2': [3, 1, 1, 2], - 'data': ['one', 'two', 'three', 'four']}) - - df2 = df.set_index(['col1', 'col2']) - df2_original = df2.copy() - - df2.index.set_levels(['b', 'd', 'a'], level='col1', inplace=True) - df2.index.set_codes([0, 1, 0, 2], level='col1', inplace=True) - assert not df2.index.is_lexsorted() - assert not df2.index.is_monotonic - - assert df2_original.index.equals(df2.index) - expected = df2.sort_index() - assert expected.index.is_lexsorted() - assert expected.index.is_monotonic - - result = df2.sort_index(level=0) - assert result.index.is_lexsorted() - assert result.index.is_monotonic - tm.assert_frame_equal(result, expected) - - def test_frame_getitem_not_sorted(self, multiindex_dataframe_random_data): - frame = multiindex_dataframe_random_data - df = frame.T - df['foo', 'four'] = 'foo' - - arrays = [np.array(x) for x in zip(*df.columns.values)] - - result = df['foo'] - result2 = df.loc[:, 'foo'] - expected = df.reindex(columns=df.columns[arrays[0] == 'foo']) - expected.columns = expected.columns.droplevel(0) - tm.assert_frame_equal(result, expected) - tm.assert_frame_equal(result2, expected) - - df = df.T - result = df.xs('foo') - result2 = df.loc['foo'] - expected = df.reindex(df.index[arrays[0] == 'foo']) - expected.index = expected.index.droplevel(0) - tm.assert_frame_equal(result, expected) - tm.assert_frame_equal(result2, expected) - - def test_series_getitem_not_sorted(self): - arrays = [['bar', 'bar', 'baz', 'baz', 'qux', 'qux', 'foo', 'foo'], - ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] - tuples = lzip(*arrays) - index = MultiIndex.from_tuples(tuples) - s = Series(randn(8), index=index) - - arrays = [np.array(x) for x in zip(*index.values)] - - result = s['qux'] - result2 = s.loc['qux'] - expected = s[arrays[0] == 'qux'] - expected.index = expected.index.droplevel(0) - tm.assert_series_equal(result, expected) - tm.assert_series_equal(result2, expected) - - -class TestMultiIndexSlicers(object): - - def test_per_axis_per_level_getitem(self): - - # GH6134 - # example test case - ix = 
MultiIndex.from_product([_mklbl('A', 5), _mklbl('B', 7), _mklbl( - 'C', 4), _mklbl('D', 2)]) - df = DataFrame(np.arange(len(ix.get_values())), index=ix) - - result = df.loc[(slice('A1', 'A3'), slice(None), ['C1', 'C3']), :] - expected = df.loc[[tuple([a, b, c, d]) - for a, b, c, d in df.index.values - if (a == 'A1' or a == 'A2' or a == 'A3') and ( - c == 'C1' or c == 'C3')]] - tm.assert_frame_equal(result, expected) - - expected = df.loc[[tuple([a, b, c, d]) - for a, b, c, d in df.index.values - if (a == 'A1' or a == 'A2' or a == 'A3') and ( - c == 'C1' or c == 'C2' or c == 'C3')]] - result = df.loc[(slice('A1', 'A3'), slice(None), slice('C1', 'C3')), :] - tm.assert_frame_equal(result, expected) - - # test multi-index slicing with per axis and per index controls - index = MultiIndex.from_tuples([('A', 1), ('A', 2), - ('A', 3), ('B', 1)], - names=['one', 'two']) - columns = MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'), - ('b', 'foo'), ('b', 'bah')], - names=['lvl0', 'lvl1']) - - df = DataFrame( - np.arange(16, dtype='int64').reshape( - 4, 4), index=index, columns=columns) - df = df.sort_index(axis=0).sort_index(axis=1) - - # identity - result = df.loc[(slice(None), slice(None)), :] - tm.assert_frame_equal(result, df) - result = df.loc[(slice(None), slice(None)), (slice(None), slice(None))] - tm.assert_frame_equal(result, df) - result = df.loc[:, (slice(None), slice(None))] - tm.assert_frame_equal(result, df) - - # index - result = df.loc[(slice(None), [1]), :] - expected = df.iloc[[0, 3]] - tm.assert_frame_equal(result, expected) - - result = df.loc[(slice(None), 1), :] - expected = df.iloc[[0, 3]] - tm.assert_frame_equal(result, expected) - - # columns - result = df.loc[:, (slice(None), ['foo'])] - expected = df.iloc[:, [1, 3]] - tm.assert_frame_equal(result, expected) - - # both - result = df.loc[(slice(None), 1), (slice(None), ['foo'])] - expected = df.iloc[[0, 3], [1, 3]] - tm.assert_frame_equal(result, expected) - - result = df.loc['A', 'a'] - expected = DataFrame(dict(bar=[1, 5, 9], foo=[0, 4, 8]), - index=Index([1, 2, 3], name='two'), - columns=Index(['bar', 'foo'], name='lvl1')) - tm.assert_frame_equal(result, expected) - - result = df.loc[(slice(None), [1, 2]), :] - expected = df.iloc[[0, 1, 3]] - tm.assert_frame_equal(result, expected) - - # multi-level series - s = Series(np.arange(len(ix.get_values())), index=ix) - result = s.loc['A1':'A3', :, ['C1', 'C3']] - expected = s.loc[[tuple([a, b, c, d]) - for a, b, c, d in s.index.values - if (a == 'A1' or a == 'A2' or a == 'A3') and ( - c == 'C1' or c == 'C3')]] - tm.assert_series_equal(result, expected) - - # boolean indexers - result = df.loc[(slice(None), df.loc[:, ('a', 'bar')] > 5), :] - expected = df.iloc[[2, 3]] - tm.assert_frame_equal(result, expected) - - def f(): - df.loc[(slice(None), np.array([True, False])), :] - - pytest.raises(ValueError, f) - - # ambiguous cases - # these can be multiply interpreted (e.g. 
in this case - # as df.loc[slice(None),[1]] as well - pytest.raises(KeyError, lambda: df.loc[slice(None), [1]]) - - result = df.loc[(slice(None), [1]), :] - expected = df.iloc[[0, 3]] - tm.assert_frame_equal(result, expected) - - # not lexsorted - assert df.index.lexsort_depth == 2 - df = df.sort_index(level=1, axis=0) - assert df.index.lexsort_depth == 0 - - msg = ('MultiIndex slicing requires the index to be ' - r'lexsorted: slicing on levels \[1\], lexsort depth 0') - with pytest.raises(UnsortedIndexError, match=msg): - df.loc[(slice(None), slice('bar')), :] - - # GH 16734: not sorted, but no real slicing - result = df.loc[(slice(None), df.loc[:, ('a', 'bar')] > 5), :] - tm.assert_frame_equal(result, df.iloc[[1, 3], :]) - - def test_multiindex_slicers_non_unique(self): - - # GH 7106 - # non-unique mi index support - df = (DataFrame(dict(A=['foo', 'foo', 'foo', 'foo'], - B=['a', 'a', 'a', 'a'], - C=[1, 2, 1, 3], - D=[1, 2, 3, 4])) - .set_index(['A', 'B', 'C']).sort_index()) - assert not df.index.is_unique - expected = (DataFrame(dict(A=['foo', 'foo'], B=['a', 'a'], - C=[1, 1], D=[1, 3])) - .set_index(['A', 'B', 'C']).sort_index()) - result = df.loc[(slice(None), slice(None), 1), :] - tm.assert_frame_equal(result, expected) - - # this is equivalent of an xs expression - result = df.xs(1, level=2, drop_level=False) - tm.assert_frame_equal(result, expected) - - df = (DataFrame(dict(A=['foo', 'foo', 'foo', 'foo'], - B=['a', 'a', 'a', 'a'], - C=[1, 2, 1, 2], - D=[1, 2, 3, 4])) - .set_index(['A', 'B', 'C']).sort_index()) - assert not df.index.is_unique - expected = (DataFrame(dict(A=['foo', 'foo'], B=['a', 'a'], - C=[1, 1], D=[1, 3])) - .set_index(['A', 'B', 'C']).sort_index()) - result = df.loc[(slice(None), slice(None), 1), :] - assert not result.index.is_unique - tm.assert_frame_equal(result, expected) - - # GH12896 - # numpy-implementation dependent bug - ints = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 13, 14, 14, 16, - 17, 18, 19, 200000, 200000] - n = len(ints) - idx = MultiIndex.from_arrays([['a'] * n, ints]) - result = Series([1] * n, index=idx) - result = result.sort_index() - result = result.loc[(slice(None), slice(100000))] - expected = Series([1] * (n - 2), index=idx[:-2]).sort_index() - tm.assert_series_equal(result, expected) - - def test_multiindex_slicers_datetimelike(self): - - # GH 7429 - # buggy/inconsistent behavior when slicing with datetime-like - import datetime - dates = [datetime.datetime(2012, 1, 1, 12, 12, 12) + - datetime.timedelta(days=i) for i in range(6)] - freq = [1, 2] - index = MultiIndex.from_product( - [dates, freq], names=['date', 'frequency']) - - df = DataFrame( - np.arange(6 * 2 * 4, dtype='int64').reshape( - -1, 4), index=index, columns=list('ABCD')) - - # multi-axis slicing - idx = pd.IndexSlice - expected = df.iloc[[0, 2, 4], [0, 1]] - result = df.loc[(slice(Timestamp('2012-01-01 12:12:12'), - Timestamp('2012-01-03 12:12:12')), - slice(1, 1)), slice('A', 'B')] - tm.assert_frame_equal(result, expected) - - result = df.loc[(idx[Timestamp('2012-01-01 12:12:12'):Timestamp( - '2012-01-03 12:12:12')], idx[1:1]), slice('A', 'B')] - tm.assert_frame_equal(result, expected) - - result = df.loc[(slice(Timestamp('2012-01-01 12:12:12'), - Timestamp('2012-01-03 12:12:12')), 1), - slice('A', 'B')] - tm.assert_frame_equal(result, expected) - - # with strings - result = df.loc[(slice('2012-01-01 12:12:12', '2012-01-03 12:12:12'), - slice(1, 1)), slice('A', 'B')] - tm.assert_frame_equal(result, expected) - - result = df.loc[(idx['2012-01-01 12:12:12':'2012-01-03 
12:12:12'], 1), - idx['A', 'B']] - tm.assert_frame_equal(result, expected) - - def test_multiindex_slicers_edges(self): - # GH 8132 - # various edge cases - df = DataFrame( - {'A': ['A0'] * 5 + ['A1'] * 5 + ['A2'] * 5, - 'B': ['B0', 'B0', 'B1', 'B1', 'B2'] * 3, - 'DATE': ["2013-06-11", "2013-07-02", "2013-07-09", "2013-07-30", - "2013-08-06", "2013-06-11", "2013-07-02", "2013-07-09", - "2013-07-30", "2013-08-06", "2013-09-03", "2013-10-01", - "2013-07-09", "2013-08-06", "2013-09-03"], - 'VALUES': [22, 35, 14, 9, 4, 40, 18, 4, 2, 5, 1, 2, 3, 4, 2]}) - - df['DATE'] = pd.to_datetime(df['DATE']) - df1 = df.set_index(['A', 'B', 'DATE']) - df1 = df1.sort_index() - - # A1 - Get all values under "A0" and "A1" - result = df1.loc[(slice('A1')), :] - expected = df1.iloc[0:10] - tm.assert_frame_equal(result, expected) - - # A2 - Get all values from the start to "A2" - result = df1.loc[(slice('A2')), :] - expected = df1 - tm.assert_frame_equal(result, expected) - - # A3 - Get all values under "B1" or "B2" - result = df1.loc[(slice(None), slice('B1', 'B2')), :] - expected = df1.iloc[[2, 3, 4, 7, 8, 9, 12, 13, 14]] - tm.assert_frame_equal(result, expected) - - # A4 - Get all values between 2013-07-02 and 2013-07-09 - result = df1.loc[(slice(None), slice(None), - slice('20130702', '20130709')), :] - expected = df1.iloc[[1, 2, 6, 7, 12]] - tm.assert_frame_equal(result, expected) - - # B1 - Get all values in B0 that are also under A0, A1 and A2 - result = df1.loc[(slice('A2'), slice('B0')), :] - expected = df1.iloc[[0, 1, 5, 6, 10, 11]] - tm.assert_frame_equal(result, expected) - - # B2 - Get all values in B0, B1 and B2 (similar to what #2 is doing for - # the As) - result = df1.loc[(slice(None), slice('B2')), :] - expected = df1 - tm.assert_frame_equal(result, expected) - - # B3 - Get all values from B1 to B2 and up to 2013-08-06 - result = df1.loc[(slice(None), slice('B1', 'B2'), - slice('2013-08-06')), :] - expected = df1.iloc[[2, 3, 4, 7, 8, 9, 12, 13]] - tm.assert_frame_equal(result, expected) - - # B4 - Same as A4 but the start of the date slice is not a key. 
- # shows indexing on a partial selection slice - result = df1.loc[(slice(None), slice(None), - slice('20130701', '20130709')), :] - expected = df1.iloc[[1, 2, 6, 7, 12]] - tm.assert_frame_equal(result, expected) - - def test_per_axis_per_level_doc_examples(self): - - # test index maker - idx = pd.IndexSlice - - # from indexing.rst / advanced - index = MultiIndex.from_product([_mklbl('A', 4), _mklbl('B', 2), - _mklbl('C', 4), _mklbl('D', 2)]) - columns = MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'), - ('b', 'foo'), ('b', 'bah')], - names=['lvl0', 'lvl1']) - df = DataFrame(np.arange(len(index) * len(columns), dtype='int64') - .reshape((len(index), len(columns))), - index=index, columns=columns) - result = df.loc[(slice('A1', 'A3'), slice(None), ['C1', 'C3']), :] - expected = df.loc[[tuple([a, b, c, d]) - for a, b, c, d in df.index.values - if (a == 'A1' or a == 'A2' or a == 'A3') and ( - c == 'C1' or c == 'C3')]] - tm.assert_frame_equal(result, expected) - result = df.loc[idx['A1':'A3', :, ['C1', 'C3']], :] - tm.assert_frame_equal(result, expected) - - result = df.loc[(slice(None), slice(None), ['C1', 'C3']), :] - expected = df.loc[[tuple([a, b, c, d]) - for a, b, c, d in df.index.values - if (c == 'C1' or c == 'C3')]] - tm.assert_frame_equal(result, expected) - result = df.loc[idx[:, :, ['C1', 'C3']], :] - tm.assert_frame_equal(result, expected) - - # not sorted - def f(): - df.loc['A1', ('a', slice('foo'))] - - pytest.raises(UnsortedIndexError, f) - - # GH 16734: not sorted, but no real slicing - tm.assert_frame_equal(df.loc['A1', (slice(None), 'foo')], - df.loc['A1'].iloc[:, [0, 2]]) - - df = df.sort_index(axis=1) - - # slicing - df.loc['A1', (slice(None), 'foo')] - df.loc[(slice(None), slice(None), ['C1', 'C3']), (slice(None), 'foo')] - - # setitem - df.loc(axis=0)[:, :, ['C1', 'C3']] = -10 - - def test_loc_axis_arguments(self): - - index = MultiIndex.from_product([_mklbl('A', 4), _mklbl('B', 2), - _mklbl('C', 4), _mklbl('D', 2)]) - columns = MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'), - ('b', 'foo'), ('b', 'bah')], - names=['lvl0', 'lvl1']) - df = DataFrame(np.arange(len(index) * len(columns), dtype='int64') - .reshape((len(index), len(columns))), - index=index, - columns=columns).sort_index().sort_index(axis=1) - - # axis 0 - result = df.loc(axis=0)['A1':'A3', :, ['C1', 'C3']] - expected = df.loc[[tuple([a, b, c, d]) - for a, b, c, d in df.index.values - if (a == 'A1' or a == 'A2' or a == 'A3') and ( - c == 'C1' or c == 'C3')]] - tm.assert_frame_equal(result, expected) - - result = df.loc(axis='index')[:, :, ['C1', 'C3']] - expected = df.loc[[tuple([a, b, c, d]) - for a, b, c, d in df.index.values - if (c == 'C1' or c == 'C3')]] - tm.assert_frame_equal(result, expected) - - # axis 1 - result = df.loc(axis=1)[:, 'foo'] - expected = df.loc[:, (slice(None), 'foo')] - tm.assert_frame_equal(result, expected) - - result = df.loc(axis='columns')[:, 'foo'] - expected = df.loc[:, (slice(None), 'foo')] - tm.assert_frame_equal(result, expected) - - # invalid axis - def f(): - df.loc(axis=-1)[:, :, ['C1', 'C3']] - - pytest.raises(ValueError, f) - - def f(): - df.loc(axis=2)[:, :, ['C1', 'C3']] - - pytest.raises(ValueError, f) - - def f(): - df.loc(axis='foo')[:, :, ['C1', 'C3']] - - pytest.raises(ValueError, f) - - def test_per_axis_per_level_setitem(self): - - # test index maker - idx = pd.IndexSlice - - # test multi-index slicing with per axis and per index controls - index = MultiIndex.from_tuples([('A', 1), ('A', 2), - ('A', 3), ('B', 1)], - names=['one', 'two']) - columns = 
MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'), - ('b', 'foo'), ('b', 'bah')], - names=['lvl0', 'lvl1']) - - df_orig = DataFrame( - np.arange(16, dtype='int64').reshape( - 4, 4), index=index, columns=columns) - df_orig = df_orig.sort_index(axis=0).sort_index(axis=1) - - # identity - df = df_orig.copy() - df.loc[(slice(None), slice(None)), :] = 100 - expected = df_orig.copy() - expected.iloc[:, :] = 100 - tm.assert_frame_equal(df, expected) - - df = df_orig.copy() - df.loc(axis=0)[:, :] = 100 - expected = df_orig.copy() - expected.iloc[:, :] = 100 - tm.assert_frame_equal(df, expected) - - df = df_orig.copy() - df.loc[(slice(None), slice(None)), (slice(None), slice(None))] = 100 - expected = df_orig.copy() - expected.iloc[:, :] = 100 - tm.assert_frame_equal(df, expected) - - df = df_orig.copy() - df.loc[:, (slice(None), slice(None))] = 100 - expected = df_orig.copy() - expected.iloc[:, :] = 100 - tm.assert_frame_equal(df, expected) - - # index - df = df_orig.copy() - df.loc[(slice(None), [1]), :] = 100 - expected = df_orig.copy() - expected.iloc[[0, 3]] = 100 - tm.assert_frame_equal(df, expected) - - df = df_orig.copy() - df.loc[(slice(None), 1), :] = 100 - expected = df_orig.copy() - expected.iloc[[0, 3]] = 100 - tm.assert_frame_equal(df, expected) - - df = df_orig.copy() - df.loc(axis=0)[:, 1] = 100 - expected = df_orig.copy() - expected.iloc[[0, 3]] = 100 - tm.assert_frame_equal(df, expected) - - # columns - df = df_orig.copy() - df.loc[:, (slice(None), ['foo'])] = 100 - expected = df_orig.copy() - expected.iloc[:, [1, 3]] = 100 - tm.assert_frame_equal(df, expected) - - # both - df = df_orig.copy() - df.loc[(slice(None), 1), (slice(None), ['foo'])] = 100 - expected = df_orig.copy() - expected.iloc[[0, 3], [1, 3]] = 100 - tm.assert_frame_equal(df, expected) - - df = df_orig.copy() - df.loc[idx[:, 1], idx[:, ['foo']]] = 100 - expected = df_orig.copy() - expected.iloc[[0, 3], [1, 3]] = 100 - tm.assert_frame_equal(df, expected) - - df = df_orig.copy() - df.loc['A', 'a'] = 100 - expected = df_orig.copy() - expected.iloc[0:3, 0:2] = 100 - tm.assert_frame_equal(df, expected) - - # setting with a list-like - df = df_orig.copy() - df.loc[(slice(None), 1), (slice(None), ['foo'])] = np.array( - [[100, 100], [100, 100]], dtype='int64') - expected = df_orig.copy() - expected.iloc[[0, 3], [1, 3]] = 100 - tm.assert_frame_equal(df, expected) - - # not enough values - df = df_orig.copy() - - def f(): - df.loc[(slice(None), 1), (slice(None), ['foo'])] = np.array( - [[100], [100, 100]], dtype='int64') - - pytest.raises(ValueError, f) - - def f(): - df.loc[(slice(None), 1), (slice(None), ['foo'])] = np.array( - [100, 100, 100, 100], dtype='int64') - - pytest.raises(ValueError, f) - - # with an alignable rhs - df = df_orig.copy() - df.loc[(slice(None), 1), (slice(None), ['foo'])] = df.loc[(slice( - None), 1), (slice(None), ['foo'])] * 5 - expected = df_orig.copy() - expected.iloc[[0, 3], [1, 3]] = expected.iloc[[0, 3], [1, 3]] * 5 - tm.assert_frame_equal(df, expected) - - df = df_orig.copy() - df.loc[(slice(None), 1), (slice(None), ['foo'])] *= df.loc[(slice( - None), 1), (slice(None), ['foo'])] - expected = df_orig.copy() - expected.iloc[[0, 3], [1, 3]] *= expected.iloc[[0, 3], [1, 3]] - tm.assert_frame_equal(df, expected) - - rhs = df_orig.loc[(slice(None), 1), (slice(None), ['foo'])].copy() - rhs.loc[:, ('c', 'bah')] = 10 - df = df_orig.copy() - df.loc[(slice(None), 1), (slice(None), ['foo'])] *= rhs - expected = df_orig.copy() - expected.iloc[[0, 3], [1, 3]] *= expected.iloc[[0, 3], [1, 3]] - 
tm.assert_frame_equal(df, expected) - - -@pytest.mark.filterwarnings('ignore:\\nPanel:FutureWarning') -class TestMultiIndexPanel(object): - - def test_iloc_getitem_panel_multiindex(self): - - # GH 7199 - # Panel with multi-index - multi_index = MultiIndex.from_tuples([('ONE', 'one'), - ('TWO', 'two'), - ('THREE', 'three')], - names=['UPPER', 'lower']) - - simple_index = [x[0] for x in multi_index] - wd1 = Panel(items=['First', 'Second'], - major_axis=['a', 'b', 'c', 'd'], - minor_axis=multi_index) - - wd2 = Panel(items=['First', 'Second'], - major_axis=['a', 'b', 'c', 'd'], - minor_axis=simple_index) - - expected1 = wd1['First'].iloc[[True, True, True, False], [0, 2]] - result1 = wd1.iloc[0, [True, True, True, False], [0, 2]] # WRONG - tm.assert_frame_equal(result1, expected1) - - expected2 = wd2['First'].iloc[[True, True, True, False], [0, 2]] - result2 = wd2.iloc[0, [True, True, True, False], [0, 2]] - tm.assert_frame_equal(result2, expected2) - - expected1 = DataFrame(index=['a'], columns=multi_index, - dtype='float64') - result1 = wd1.iloc[0, [0], [0, 1, 2]] - tm.assert_frame_equal(result1, expected1) - - expected2 = DataFrame(index=['a'], columns=simple_index, - dtype='float64') - result2 = wd2.iloc[0, [0], [0, 1, 2]] - tm.assert_frame_equal(result2, expected2) - - # GH 7516 - mi = MultiIndex.from_tuples([(0, 'x'), (1, 'y'), (2, 'z')]) - p = Panel(np.arange(3 * 3 * 3, dtype='int64').reshape(3, 3, 3), - items=['a', 'b', 'c'], major_axis=mi, - minor_axis=['u', 'v', 'w']) - result = p.iloc[:, 1, 0] - expected = Series([3, 12, 21], index=['a', 'b', 'c'], name='u') - tm.assert_series_equal(result, expected) - - result = p.loc[:, (1, 'y'), 'u'] - tm.assert_series_equal(result, expected) - - def test_panel_setitem_with_multiindex(self): - - # 10360 - # failing with a multi-index - arr = np.array([[[1, 2, 3], [0, 0, 0]], - [[0, 0, 0], [0, 0, 0]]], - dtype=np.float64) - - # reg index - axes = dict(items=['A', 'B'], major_axis=[0, 1], - minor_axis=['X', 'Y', 'Z']) - p1 = Panel(0., **axes) - p1.iloc[0, 0, :] = [1, 2, 3] - expected = Panel(arr, **axes) - tm.assert_panel_equal(p1, expected) - - # multi-indexes - axes['items'] = MultiIndex.from_tuples( - [('A', 'a'), ('B', 'b')]) - p2 = Panel(0., **axes) - p2.iloc[0, 0, :] = [1, 2, 3] - expected = Panel(arr, **axes) - tm.assert_panel_equal(p2, expected) - - axes['major_axis'] = MultiIndex.from_tuples( - [('A', 1), ('A', 2)]) - p3 = Panel(0., **axes) - p3.iloc[0, 0, :] = [1, 2, 3] - expected = Panel(arr, **axes) - tm.assert_panel_equal(p3, expected) - - axes['minor_axis'] = MultiIndex.from_product( - [['X'], range(3)]) - p4 = Panel(0., **axes) - p4.iloc[0, 0, :] = [1, 2, 3] - expected = Panel(arr, **axes) - tm.assert_panel_equal(p4, expected) - - arr = np.array( - [[[1, 0, 0], [2, 0, 0]], [[0, 0, 0], [0, 0, 0]]], - dtype=np.float64) - p5 = Panel(0., **axes) - p5.iloc[0, :, 0] = [1, 2] - expected = Panel(arr, **axes) - tm.assert_panel_equal(p5, expected) - - -def test_multiindex_period_datetime(): - # GH4861, using datetime in period of multiindex raises exception - - idx1 = Index(['a', 'a', 'a', 'b', 'b']) - idx2 = period_range('2012-01', periods=len(idx1), freq='M') - s = Series(np.random.randn(len(idx1)), [idx1, idx2]) - - # try Period as index - expected = s.iloc[0] - result = s.loc['a', Period('2012-01')] - assert result == expected - - # try datetime as index - result = s.loc['a', datetime(2012, 1, 1)] - assert result == expected diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py index 
84bc1863aadd9..d36de931e2610 100644
--- a/pandas/tests/util/test_hashing.py
+++ b/pandas/tests/util/test_hashing.py
@@ -150,7 +150,7 @@ def test_multiindex_unique():
 
 def test_multiindex_objects():
     mi = MultiIndex(levels=[["b", "d", "a"], [1, 2, 3]],
-                    labels=[[0, 1, 0, 2], [2, 0, 0, 1]],
+                    codes=[[0, 1, 0, 2], [2, 0, 0, 1]],
                     names=["col1", "col2"])
     recons = mi._sort_levels_monotonic()
 

From d84188d8edc711ae3d9e7a8d8050de2ee0a6ecbd Mon Sep 17 00:00:00 2001
From: tp
Date: Sun, 2 Dec 2018 22:25:22 +0000
Subject: [PATCH 11/12] minor changes

---
 doc/source/advanced.rst         | 2 +-
 doc/source/whatsnew/v0.24.0.rst | 2 +-
 pandas/core/groupby/generic.py  | 6 +++---
 pandas/core/indexes/multi.py    | 4 ++++
 4 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst
index e16b2652ab7d4..39082ef7a4c69 100644
--- a/doc/source/advanced.rst
+++ b/doc/source/advanced.rst
@@ -474,7 +474,7 @@ values across a level. For instance:
 .. ipython:: python
 
    midx = pd.MultiIndex(levels=[['zero', 'one'], ['x', 'y']],
-                        codes=[[1, 1, 0, 0],[1, 0, 1, 0]])
+                        codes=[[1, 1, 0, 0], [1, 0, 1, 0]])
    df = pd.DataFrame(np.random.randn(4, 2), index=midx)
    df
    df2 = df.mean(level=0)
diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
index 67e65c185b33e..3bfae789f0478 100644
--- a/doc/source/whatsnew/v0.24.0.rst
+++ b/doc/source/whatsnew/v0.24.0.rst
@@ -1101,7 +1101,7 @@ Deprecations
 ~~~~~~~~~~~~
 
 - :attr:`MultiIndex.labels` has been deprecated and replaced by :attr:`MultiIndex.codes`.
-  The functionality is unchanged. This new name better reflects the natures of
+  The functionality is unchanged. The new name better reflects the natures of
   these codes and makes the ``MultiIndex`` API more similar to the API for
   :class:`CategoricalIndex`(:issue:`13443`).
   As a consequence, other uses of the name ``labels`` in ``MultiIndex`` have also been deprecated and replaced with ``codes``:
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 95bf50fc4ca4d..26e437355fa8b 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -1191,10 +1191,10 @@ def value_counts(self, normalize=False, sort=True, ascending=False,
             out, left[-1] = out[sorter], left[-1][sorter]
 
         # build the multi-index w/ full levels
-        labels = list(map(lambda lab: np.repeat(lab[diff], nbin), labels[:-1]))
-        labels.append(left[-1])
+        codes = list(map(lambda lab: np.repeat(lab[diff], nbin), labels[:-1]))
+        codes.append(left[-1])
 
-        mi = MultiIndex(levels=levels, codes=labels, names=names,
+        mi = MultiIndex(levels=levels, codes=codes, names=names,
                         verify_integrity=False)
 
         if is_integer_dtype(out):
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index c779281cfcf16..5e26a3c6c439e 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -633,6 +633,10 @@ def set_codes(self, codes, level=None, inplace=False,
         Set new codes on MultiIndex. Defaults to returning
         new index.
 
+        .. versionadded:: 0.24.0
+
+        New name for deprecated method `set_labels`.
+
         Parameters
         ----------
         codes : sequence or list of sequence

From a8d00ad61651cfcecf6f9da6a1b23ee232531e8f Mon Sep 17 00:00:00 2001
From: tp
Date: Wed, 5 Dec 2018 17:44:45 +0000
Subject: [PATCH 12/12] small updates according to comments

---
 doc/source/whatsnew/v0.24.0.rst | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
index 3bfae789f0478..090127f50c6c2 100644
--- a/doc/source/whatsnew/v0.24.0.rst
+++ b/doc/source/whatsnew/v0.24.0.rst
@@ -1102,12 +1102,11 @@ Deprecations
 
 - :attr:`MultiIndex.labels` has been deprecated and replaced by :attr:`MultiIndex.codes`.
   The functionality is unchanged. The new name better reflects the natures of
-  these codes and makes the ``MultiIndex`` API more similar to the API for
-  :class:`CategoricalIndex`(:issue:`13443`).
+  these codes and makes the ``MultiIndex`` API more similar to the API for :class:`CategoricalIndex`(:issue:`13443`).
   As a consequence, other uses of the name ``labels`` in ``MultiIndex`` have also been deprecated and replaced with ``codes``:
   - You should initialize a ``MultiIndex`` instance using a parameter named ``codes`` rather than ``labels``.
-  - :meth:`MultiIndex.set_labels` has been deprecated in favor of :meth:`MultiIndex.set_codes`
-  - for method :meth:`MultiIndex.copy`, the ``labels`` parameter has been deprecated and replaced by a ``codes`` parameter.
+  - ``MultiIndex.set_labels`` has been deprecated in favor of :meth:`MultiIndex.set_codes`.
+  - For method :meth:`MultiIndex.copy`, the ``labels`` parameter has been deprecated and replaced by a ``codes`` parameter.
 - :meth:`DataFrame.to_stata`, :meth:`read_stata`, :class:`StataReader` and :class:`StataWriter` have deprecated the ``encoding`` argument. The encoding of a Stata dta file is determined by the file type and cannot be changed (:issue:`21244`)
 - :meth:`MultiIndex.to_hierarchical` is deprecated and will be removed in a future version (:issue:`21613`)
 - :meth:`Series.ptp` is deprecated. Use ``numpy.ptp`` instead (:issue:`21614`)
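
For reference, and not part of the patch series itself: a minimal usage sketch of the renamed API that the whatsnew entry above describes, assuming pandas 0.24.0. The index ``mi`` and its levels/codes below are made up for illustration; the deprecated ``labels`` spellings keep working during the deprecation period but emit a warning.

    import pandas as pd

    # Construct a MultiIndex with the new ``codes`` keyword (formerly ``labels``).
    mi = pd.MultiIndex(levels=[['a', 'b'], [1, 2]],
                       codes=[[0, 0, 1, 1], [0, 1, 0, 1]])

    # The integer positions are exposed as ``.codes`` (formerly ``.labels``).
    print(mi.codes)

    # Replace the codes of a single level; formerly ``set_labels``.
    mi = mi.set_codes([1, 1, 0, 0], level=0)

The same example written against the old spellings (``labels=``, ``.labels``, ``set_labels``) behaves identically, which is the point of the rename: only the name changes, not the functionality.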