diff --git a/pandas/tests/frame/indexing/test_getitem.py b/pandas/tests/frame/indexing/test_getitem.py new file mode 100644 index 0000000000000..47d53ef6f3619 --- /dev/null +++ b/pandas/tests/frame/indexing/test_getitem.py @@ -0,0 +1,16 @@ +import pytest + +from pandas import DataFrame, MultiIndex + + +class TestGetitem: + def test_getitem_unused_level_raises(self): + # GH#20410 + mi = MultiIndex( + levels=[["a_lot", "onlyone", "notevenone"], [1970, ""]], + codes=[[1, 0], [1, 0]], + ) + df = DataFrame(-1, index=range(3), columns=mi) + + with pytest.raises(KeyError, match="notevenone"): + df["notevenone"] diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 25e6142476d65..ddca67306d804 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1233,17 +1233,17 @@ def test_sum_timedelta64_skipna_false(): arr = np.arange(8).astype(np.int64).view("m8[s]").reshape(4, 2) arr[-1, -1] = "Nat" - df = pd.DataFrame(arr) + df = DataFrame(arr) result = df.sum(skipna=False) - expected = pd.Series([pd.Timedelta(seconds=12), pd.NaT]) + expected = Series([pd.Timedelta(seconds=12), pd.NaT]) tm.assert_series_equal(result, expected) result = df.sum(axis=0, skipna=False) tm.assert_series_equal(result, expected) result = df.sum(axis=1, skipna=False) - expected = pd.Series( + expected = Series( [ pd.Timedelta(seconds=1), pd.Timedelta(seconds=5), diff --git a/pandas/tests/frame/test_join.py b/pandas/tests/frame/test_join.py index 2438c743f3b8a..a2e1f2398e711 100644 --- a/pandas/tests/frame/test_join.py +++ b/pandas/tests/frame/test_join.py @@ -222,6 +222,31 @@ def test_suppress_future_warning_with_sort_kw(sort_kw): class TestDataFrameJoin: + def test_join(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + + a = frame.loc[frame.index[:5], ["A"]] + b = frame.loc[frame.index[2:], ["B", "C"]] + + joined = a.join(b, how="outer").reindex(frame.index) + expected = frame.copy() + expected.values[np.isnan(joined.values)] = np.nan + + assert not np.isnan(joined.values).all() + + # TODO what should join do with names ? + tm.assert_frame_equal(joined, expected, check_names=False) + + def test_join_segfault(self): + # GH#1532 + df1 = DataFrame({"a": [1, 1], "b": [1, 2], "x": [1, 2]}) + df2 = DataFrame({"a": [2, 2], "b": [1, 2], "y": [1, 2]}) + df1 = df1.set_index(["a", "b"]) + df2 = df2.set_index(["a", "b"]) + # it works! + for how in ["left", "right", "outer"]: + df1.join(df2, how=how) + def test_join_str_datetime(self): str_dates = ["20120209", "20120222"] dt_dates = [datetime(2012, 2, 9), datetime(2012, 2, 22)] diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 70898367f6a40..9820b39e20651 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -14,8 +14,6 @@ from pandas.compat.numpy import np_datetime64_compat from pandas.util._test_decorators import async_mark -from pandas.core.dtypes.generic import ABCIndex - import pandas as pd from pandas import ( CategoricalIndex, @@ -2518,10 +2516,6 @@ def test_ensure_index_mixed_closed_intervals(self): ], ) def test_generated_op_names(opname, index): - if isinstance(index, ABCIndex) and opname == "rsub": - # Index.__rsub__ does not exist; though the method does exist - # for subclasses. see GH#19723 - return opname = f"__{opname}__" method = getattr(index, opname) assert method.__name__ == opname diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 5c5692b777360..3b915f13c7568 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -8,7 +8,7 @@ from pandas.compat.numpy import is_numpy_dev import pandas as pd -from pandas import DataFrame, Series, Timestamp, date_range +from pandas import DataFrame, MultiIndex, Series, Timestamp, date_range import pandas._testing as tm from pandas.api.types import is_scalar from pandas.tests.indexing.common import Base @@ -979,6 +979,47 @@ def test_loc_reverse_assignment(self): tm.assert_series_equal(result, expected) +class TestLocWithMultiIndex: + @pytest.mark.parametrize( + "keys, expected", + [ + (["b", "a"], [["b", "b", "a", "a"], [1, 2, 1, 2]]), + (["a", "b"], [["a", "a", "b", "b"], [1, 2, 1, 2]]), + ((["a", "b"], [1, 2]), [["a", "a", "b", "b"], [1, 2, 1, 2]]), + ((["a", "b"], [2, 1]), [["a", "a", "b", "b"], [2, 1, 2, 1]]), + ((["b", "a"], [2, 1]), [["b", "b", "a", "a"], [2, 1, 2, 1]]), + ((["b", "a"], [1, 2]), [["b", "b", "a", "a"], [1, 2, 1, 2]]), + ((["c", "a"], [2, 1]), [["c", "a", "a"], [1, 2, 1]]), + ], + ) + @pytest.mark.parametrize("dim", ["index", "columns"]) + def test_loc_getitem_multilevel_index_order(self, dim, keys, expected): + # GH#22797 + # Try to respect order of keys given for MultiIndex.loc + kwargs = {dim: [["c", "a", "a", "b", "b"], [1, 1, 2, 1, 2]]} + df = DataFrame(np.arange(25).reshape(5, 5), **kwargs) + exp_index = MultiIndex.from_arrays(expected) + if dim == "index": + res = df.loc[keys, :] + tm.assert_index_equal(res.index, exp_index) + elif dim == "columns": + res = df.loc[:, keys] + tm.assert_index_equal(res.columns, exp_index) + + def test_loc_preserve_names(self, multiindex_year_month_day_dataframe_random_data): + ymd = multiindex_year_month_day_dataframe_random_data + + result = ymd.loc[2000] + result2 = ymd["A"].loc[2000] + assert result.index.names == ymd.index.names[1:] + assert result2.index.names == ymd.index.names[1:] + + result = ymd.loc[2000, 2] + result2 = ymd["A"].loc[2000, 2] + assert result.index.name == ymd.index.names[2] + assert result2.index.name == ymd.index.names[2] + + def test_series_loc_getitem_label_list_missing_values(): # gh-11428 key = np.array( diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py index f5781fff15932..f4f963d268aeb 100644 --- a/pandas/tests/io/formats/test_to_html.py +++ b/pandas/tests/io/formats/test_to_html.py @@ -822,6 +822,14 @@ def test_html_repr_min_rows(datapath, max_rows, min_rows, expected): assert result == expected +def test_to_html_multilevel(multiindex_year_month_day_dataframe_random_data): + ymd = multiindex_year_month_day_dataframe_random_data + + ymd.columns.name = "foo" + ymd.to_html() + ymd.T.to_html() + + @pytest.mark.parametrize("na_rep", ["NaN", "Ted"]) def test_to_html_na_rep_and_float_format(na_rep): # https://github.com/pandas-dev/pandas/issues/13828 diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index a37349654b120..afc271264edbf 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -531,3 +531,20 @@ def test_pickle_binary_object_compression(compression): read_df = pd.read_pickle(buffer, compression=compression) buffer.seek(0) tm.assert_frame_equal(df, read_df) + + +def test_pickle_dataframe_with_multilevel_index( + multiindex_year_month_day_dataframe_random_data, + multiindex_dataframe_random_data, +): + ymd = multiindex_year_month_day_dataframe_random_data + frame = multiindex_dataframe_random_data + + def _test_roundtrip(frame): + unpickled = tm.round_trip_pickle(frame) + tm.assert_frame_equal(frame, unpickled) + + _test_roundtrip(frame) + _test_roundtrip(frame.T) + _test_roundtrip(ymd) + _test_roundtrip(ymd.T) diff --git a/pandas/tests/series/methods/test_count.py b/pandas/tests/series/methods/test_count.py index df1babbee8e18..7fff87c7b55f4 100644 --- a/pandas/tests/series/methods/test_count.py +++ b/pandas/tests/series/methods/test_count.py @@ -7,6 +7,26 @@ class TestSeriesCount: + def test_count_level_series(self): + index = MultiIndex( + levels=[["foo", "bar", "baz"], ["one", "two", "three", "four"]], + codes=[[0, 0, 0, 2, 2], [2, 0, 1, 1, 2]], + ) + + ser = Series(np.random.randn(len(index)), index=index) + + result = ser.count(level=0) + expected = ser.groupby(level=0).count() + tm.assert_series_equal( + result.astype("f8"), expected.reindex(result.index).fillna(0) + ) + + result = ser.count(level=1) + expected = ser.groupby(level=1).count() + tm.assert_series_equal( + result.astype("f8"), expected.reindex(result.index).fillna(0) + ) + def test_count_multiindex(self, series_with_multilevel_index): ser = series_with_multilevel_index diff --git a/pandas/tests/series/test_reductions.py b/pandas/tests/series/test_reductions.py index 28d29c69f6526..0c946b2cbccc9 100644 --- a/pandas/tests/series/test_reductions.py +++ b/pandas/tests/series/test_reductions.py @@ -2,7 +2,8 @@ import pytest import pandas as pd -from pandas import Series +from pandas import MultiIndex, Series +import pandas._testing as tm def test_reductions_td64_with_nat(): @@ -46,6 +47,14 @@ def test_prod_numpy16_bug(): assert not isinstance(result, Series) +def test_sum_with_level(): + obj = Series([10.0], index=MultiIndex.from_tuples([(2, 3)])) + + result = obj.sum(level=0) + expected = Series([10.0], index=[2]) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("func", [np.any, np.all]) @pytest.mark.parametrize("kwargs", [dict(keepdims=True), dict(out=object())]) def test_validate_any_all_out_keepdims_raises(kwargs, func): diff --git a/pandas/tests/test_expressions.py b/pandas/tests/test_expressions.py index 6db1078fcde4f..6d4c1594146de 100644 --- a/pandas/tests/test_expressions.py +++ b/pandas/tests/test_expressions.py @@ -48,30 +48,37 @@ def setup_method(self, method): def teardown_method(self, method): expr._MIN_ELEMENTS = self._MIN_ELEMENTS - def run_arithmetic(self, df, other): + @staticmethod + def call_op(df, other, flex: bool, opname: str): + if flex: + op = lambda x, y: getattr(x, opname)(y) + op.__name__ = opname + else: + op = getattr(operator, opname) + + expr.set_use_numexpr(False) + expected = op(df, other) + expr.set_use_numexpr(True) + + expr.get_test_result() + + result = op(df, other) + return result, expected + + def run_arithmetic(self, df, other, flex: bool): expr._MIN_ELEMENTS = 0 operations = ["add", "sub", "mul", "mod", "truediv", "floordiv"] - for test_flex in [True, False]: - for arith in operations: - # TODO: share with run_binary - if test_flex: - op = lambda x, y: getattr(x, arith)(y) - op.__name__ = arith + for arith in operations: + result, expected = self.call_op(df, other, flex, arith) + + if arith == "truediv": + if expected.ndim == 1: + assert expected.dtype.kind == "f" else: - op = getattr(operator, arith) - expr.set_use_numexpr(False) - expected = op(df, other) - expr.set_use_numexpr(True) - - result = op(df, other) - if arith == "truediv": - if expected.ndim == 1: - assert expected.dtype.kind == "f" - else: - assert all(x.kind == "f" for x in expected.dtypes.values) - tm.assert_equal(expected, result) - - def run_binary(self, df, other): + assert all(x.kind == "f" for x in expected.dtypes.values) + tm.assert_equal(expected, result) + + def run_binary(self, df, other, flex: bool): """ tests solely that the result is the same whether or not numexpr is enabled. Need to test whether the function does the correct thing @@ -81,37 +88,27 @@ def run_binary(self, df, other): expr.set_test_mode(True) operations = ["gt", "lt", "ge", "le", "eq", "ne"] - for test_flex in [True, False]: - for arith in operations: - if test_flex: - op = lambda x, y: getattr(x, arith)(y) - op.__name__ = arith - else: - op = getattr(operator, arith) - expr.set_use_numexpr(False) - expected = op(df, other) - expr.set_use_numexpr(True) - - expr.get_test_result() - result = op(df, other) - used_numexpr = expr.get_test_result() - assert used_numexpr, "Did not use numexpr as expected." - tm.assert_equal(expected, result) - - def run_frame(self, df, other, run_binary=True): - self.run_arithmetic(df, other) - if run_binary: - expr.set_use_numexpr(False) - binary_comp = other + 1 - expr.set_use_numexpr(True) - self.run_binary(df, binary_comp) + for arith in operations: + result, expected = self.call_op(df, other, flex, arith) + + used_numexpr = expr.get_test_result() + assert used_numexpr, "Did not use numexpr as expected." + tm.assert_equal(expected, result) + + def run_frame(self, df, other, flex: bool): + self.run_arithmetic(df, other, flex) + + expr.set_use_numexpr(False) + binary_comp = other + 1 + expr.set_use_numexpr(True) + self.run_binary(df, binary_comp, flex) for i in range(len(df.columns)): - self.run_arithmetic(df.iloc[:, i], other.iloc[:, i]) + self.run_arithmetic(df.iloc[:, i], other.iloc[:, i], flex) # FIXME: dont leave commented-out # series doesn't uses vec_compare instead of numexpr... # binary_comp = other.iloc[:, i] + 1 - # self.run_binary(df.iloc[:, i], binary_comp) + # self.run_binary(df.iloc[:, i], binary_comp, flex) @pytest.mark.parametrize( "df", @@ -126,14 +123,9 @@ def run_frame(self, df, other, run_binary=True): _mixed2, ], ) - def test_arithmetic(self, df): - # TODO: FIGURE OUT HOW TO GET RUN_BINARY TO WORK WITH MIXED=... - # can't do arithmetic because comparison methods try to do *entire* - # frame instead of by-column - kinds = {x.kind for x in df.dtypes.values} - should = len(kinds) == 1 - - self.run_frame(df, df, run_binary=should) + @pytest.mark.parametrize("flex", [True, False]) + def test_arithmetic(self, df, flex): + self.run_frame(df, df, flex) def test_invalid(self): diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 8b8e49d914905..901049209bf64 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -32,7 +32,7 @@ def test_append(self, multiindex_dataframe_random_data): result = a["A"].append(b["A"]) tm.assert_series_equal(result, frame["A"]) - def test_dataframe_constructor(self): + def test_dataframe_constructor_infer_multiindex(self): multi = DataFrame( np.random.randn(4, 4), index=[np.array(["a", "a", "b", "b"]), np.array(["x", "y", "x", "y"])], @@ -45,7 +45,7 @@ def test_dataframe_constructor(self): ) assert isinstance(multi.columns, MultiIndex) - def test_series_constructor(self): + def test_series_constructor_infer_multiindex(self): multi = Series( 1.0, index=[np.array(["a", "a", "b", "b"]), np.array(["x", "y", "x", "y"])] ) @@ -103,23 +103,6 @@ def _check_op(opname): _check_op("mul") _check_op("div") - def test_pickle( - self, - multiindex_year_month_day_dataframe_random_data, - multiindex_dataframe_random_data, - ): - ymd = multiindex_year_month_day_dataframe_random_data - frame = multiindex_dataframe_random_data - - def _test_roundtrip(frame): - unpickled = tm.round_trip_pickle(frame) - tm.assert_frame_equal(frame, unpickled) - - _test_roundtrip(frame) - _test_roundtrip(frame.T) - _test_roundtrip(ymd) - _test_roundtrip(ymd.T) - def test_reindex(self, multiindex_dataframe_random_data): frame = multiindex_dataframe_random_data @@ -146,37 +129,6 @@ def test_reindex_preserve_levels( chunk = ymdT.loc[:, new_index] assert chunk.columns is new_index - def test_count_level_series(self): - index = MultiIndex( - levels=[["foo", "bar", "baz"], ["one", "two", "three", "four"]], - codes=[[0, 0, 0, 2, 2], [2, 0, 1, 1, 2]], - ) - - s = Series(np.random.randn(len(index)), index=index) - - result = s.count(level=0) - expected = s.groupby(level=0).count() - tm.assert_series_equal( - result.astype("f8"), expected.reindex(result.index).fillna(0) - ) - - result = s.count(level=1) - expected = s.groupby(level=1).count() - tm.assert_series_equal( - result.astype("f8"), expected.reindex(result.index).fillna(0) - ) - - def test_unused_level_raises(self): - # GH 20410 - mi = MultiIndex( - levels=[["a_lot", "onlyone", "notevenone"], [1970, ""]], - codes=[[1, 0], [1, 0]], - ) - df = DataFrame(-1, index=range(3), columns=mi) - - with pytest.raises(KeyError, match="notevenone"): - df["notevenone"] - def test_groupby_transform(self, multiindex_dataframe_random_data): frame = multiindex_dataframe_random_data @@ -219,21 +171,6 @@ def test_groupby_level_no_obs(self): result = grouped.sum() assert (result.columns == ["f2", "f3"]).all() - def test_join(self, multiindex_dataframe_random_data): - frame = multiindex_dataframe_random_data - - a = frame.loc[frame.index[:5], ["A"]] - b = frame.loc[frame.index[2:], ["B", "C"]] - - joined = a.join(b, how="outer").reindex(frame.index) - expected = frame.copy() - expected.values[np.isnan(joined.values)] = np.nan - - assert not np.isnan(joined.values).all() - - # TODO what should join do with names ? - tm.assert_frame_equal(joined, expected, check_names=False) - def test_insert_index(self, multiindex_year_month_day_dataframe_random_data): ymd = multiindex_year_month_day_dataframe_random_data @@ -323,13 +260,6 @@ def aggf(x): tm.assert_frame_equal(leftside, rightside) - def test_stat_op_corner(self): - obj = Series([10.0], index=MultiIndex.from_tuples([(2, 3)])) - - result = obj.sum(level=0) - expected = Series([10.0], index=[2]) - tm.assert_series_equal(result, expected) - def test_std_var_pass_ddof(self): index = MultiIndex.from_arrays( [np.arange(5).repeat(10), np.tile(np.arange(10), 5)] @@ -389,26 +319,6 @@ def test_multilevel_consolidate(self): df["Totals", ""] = df.sum(1) df = df._consolidate() - def test_loc_preserve_names(self, multiindex_year_month_day_dataframe_random_data): - ymd = multiindex_year_month_day_dataframe_random_data - - result = ymd.loc[2000] - result2 = ymd["A"].loc[2000] - assert result.index.names == ymd.index.names[1:] - assert result2.index.names == ymd.index.names[1:] - - result = ymd.loc[2000, 2] - result2 = ymd["A"].loc[2000, 2] - assert result.index.name == ymd.index.names[2] - assert result2.index.name == ymd.index.names[2] - - def test_to_html(self, multiindex_year_month_day_dataframe_random_data): - ymd = multiindex_year_month_day_dataframe_random_data - - ymd.columns.name = "foo" - ymd.to_html() - ymd.T.to_html() - def test_level_with_tuples(self): index = MultiIndex( levels=[[("foo", "bar", 0), ("foo", "baz", 0), ("foo", "qux", 0)], [0, 1]], @@ -484,16 +394,6 @@ def test_unicode_repr_level_names(self): repr(s) repr(df) - def test_join_segfault(self): - # 1532 - df1 = DataFrame({"a": [1, 1], "b": [1, 2], "x": [1, 2]}) - df2 = DataFrame({"a": [2, 2], "b": [1, 2], "y": [1, 2]}) - df1 = df1.set_index(["a", "b"]) - df2 = df2.set_index(["a", "b"]) - # it works! - for how in ["left", "right", "outer"]: - df1.join(df2, how=how) - @pytest.mark.parametrize("d", [4, "d"]) def test_empty_frame_groupby_dtypes_consistency(self, d): # GH 20888 @@ -583,29 +483,3 @@ def test_sort_non_lexsorted(self): ) result = sorted.loc[pd.IndexSlice["B":"C", "a":"c"], :] tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( - "keys, expected", - [ - (["b", "a"], [["b", "b", "a", "a"], [1, 2, 1, 2]]), - (["a", "b"], [["a", "a", "b", "b"], [1, 2, 1, 2]]), - ((["a", "b"], [1, 2]), [["a", "a", "b", "b"], [1, 2, 1, 2]]), - ((["a", "b"], [2, 1]), [["a", "a", "b", "b"], [2, 1, 2, 1]]), - ((["b", "a"], [2, 1]), [["b", "b", "a", "a"], [2, 1, 2, 1]]), - ((["b", "a"], [1, 2]), [["b", "b", "a", "a"], [1, 2, 1, 2]]), - ((["c", "a"], [2, 1]), [["c", "a", "a"], [1, 2, 1]]), - ], - ) - @pytest.mark.parametrize("dim", ["index", "columns"]) - def test_multilevel_index_loc_order(self, dim, keys, expected): - # GH 22797 - # Try to respect order of keys given for MultiIndex.loc - kwargs = {dim: [["c", "a", "a", "b", "b"], [1, 1, 2, 1, 2]]} - df = DataFrame(np.arange(25).reshape(5, 5), **kwargs) - exp_index = MultiIndex.from_arrays(expected) - if dim == "index": - res = df.loc[keys, :] - tm.assert_index_equal(res.index, exp_index) - elif dim == "columns": - res = df.loc[:, keys] - tm.assert_index_equal(res.columns, exp_index)