diff --git a/pandas/tests/frame/indexing/test_categorical.py b/pandas/tests/frame/indexing/test_categorical.py
new file mode 100644
index 0000000000000..b595e48797d41
--- /dev/null
+++ b/pandas/tests/frame/indexing/test_categorical.py
@@ -0,0 +1,388 @@
+import numpy as np
+import pytest
+
+from pandas.core.dtypes.dtypes import CategoricalDtype
+
+import pandas as pd
+from pandas import Categorical, DataFrame, Index, Series
+import pandas.util.testing as tm
+
+
+class TestDataFrameIndexingCategorical:  # indexing/assignment with categorical data
+    def test_assignment(self):
+        # assigning a Categorical (array, then Series) as a column keeps its dtype
+        df = DataFrame(
+            {"value": np.array(np.random.randint(0, 10000, 100), dtype="int32")}
+        )
+        labels = Categorical(
+            ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)]
+        )
+
+        df = df.sort_values(by=["value"], ascending=True)
+        s = pd.cut(df.value, range(0, 10500, 500), right=False, labels=labels)
+        d = s.values
+        df["D"] = d
+        str(df)
+
+        result = df.dtypes
+        expected = Series(
+            [np.dtype("int32"), CategoricalDtype(categories=labels, ordered=False)],
+            index=["value", "D"],
+        )
+        tm.assert_series_equal(result, expected)
+
+        df["E"] = s
+        str(df)
+
+        result = df.dtypes
+        expected = Series(
+            [
+                np.dtype("int32"),
+                CategoricalDtype(categories=labels, ordered=False),
+                CategoricalDtype(categories=labels, ordered=False),
+            ],
+            index=["value", "D", "E"],
+        )
+        tm.assert_series_equal(result, expected)
+
+        result1 = df["D"]
+        result2 = df["E"]
+        tm.assert_categorical_equal(result1._data._block.values, d)
+
+        # sorting: column E must equal the source Series once both are index-sorted
+        s.name = "E"
+        tm.assert_series_equal(result2.sort_index(), s.sort_index())
+
+        cat = Categorical([1, 2, 3, 10], categories=[1, 2, 3, 4, 10])
+        df = DataFrame(Series(cat))  # built only; nothing is asserted on it
+
+    def test_assigning_ops(self):
+        # systematically test the assigning operations:
+        # for all slicing ops:
+        # for value in categories and value not in categories:
+
+        # - assign a single value -> exp_single_cats_value
+
+        # - assign a complete row (mixed values) -> exp_single_row
+
+        # assign multiple rows (mixed values) (-> array) -> exp_multi_row
+
+        # assign a part of a column with dtype == categorical ->
+        # exp_parts_cats_col
+
+        # assign a part of a column with dtype != categorical ->
+        # exp_parts_cats_col
+
+        cats = Categorical(["a", "a", "a", "a", "a", "a", "a"], categories=["a", "b"])
+        idx = Index(["h", "i", "j", "k", "l", "m", "n"])
+        values = [1, 1, 1, 1, 1, 1, 1]
+        orig = DataFrame({"cats": cats, "values": values}, index=idx)
+
+        # the expected values
+        # changed single row
+        cats1 = Categorical(["a", "a", "b", "a", "a", "a", "a"], categories=["a", "b"])
+        idx1 = Index(["h", "i", "j", "k", "l", "m", "n"])
+        values1 = [1, 1, 2, 1, 1, 1, 1]
+        exp_single_row = DataFrame({"cats": cats1, "values": values1}, index=idx1)
+
+        # changed multiple rows
+        cats2 = Categorical(["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"])
+        idx2 = Index(["h", "i", "j", "k", "l", "m", "n"])
+        values2 = [1, 1, 2, 2, 1, 1, 1]
+        exp_multi_row = DataFrame({"cats": cats2, "values": values2}, index=idx2)
+
+        # changed part of the cats column
+        cats3 = Categorical(["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"])
+        idx3 = Index(["h", "i", "j", "k", "l", "m", "n"])
+        values3 = [1, 1, 1, 1, 1, 1, 1]
+        exp_parts_cats_col = DataFrame({"cats": cats3, "values": values3}, index=idx3)
+
+        # changed single value in cats col
+        cats4 = Categorical(["a", "a", "b", "a", "a", "a", "a"], categories=["a", "b"])
+        idx4 = Index(["h", "i", "j", "k", "l", "m", "n"])
+        values4 = [1, 1, 1, 1, 1, 1, 1]
+        exp_single_cats_value = DataFrame(
+            {"cats": cats4, "values": values4}, index=idx4
+        )
+
+        # iloc
+        # ###############
+        # - assign a single value -> exp_single_cats_value
+        df = orig.copy()
+        df.iloc[2, 0] = "b"
+        tm.assert_frame_equal(df, exp_single_cats_value)
+
+        df = orig.copy()
+        df.iloc[df.index == "j", 0] = "b"
+        tm.assert_frame_equal(df, exp_single_cats_value)
+
+        # - assign a single value not in the current categories set
+        with pytest.raises(ValueError):
+            df = orig.copy()
+            df.iloc[2, 0] = "c"
+
+        # - assign a complete row (mixed values) -> exp_single_row
+        df = orig.copy()
+        df.iloc[2, :] = ["b", 2]
+        tm.assert_frame_equal(df, exp_single_row)
+
+        # - assign a complete row (mixed values) not in categories set
+        with pytest.raises(ValueError):
+            df = orig.copy()
+            df.iloc[2, :] = ["c", 2]
+
+        # - assign multiple rows (mixed values) -> exp_multi_row
+        df = orig.copy()
+        df.iloc[2:4, :] = [["b", 2], ["b", 2]]
+        tm.assert_frame_equal(df, exp_multi_row)
+
+        with pytest.raises(ValueError):
+            df = orig.copy()
+            df.iloc[2:4, :] = [["c", 2], ["c", 2]]
+
+        # assign a part of a column with dtype == categorical ->
+        # exp_parts_cats_col
+        df = orig.copy()
+        df.iloc[2:4, 0] = Categorical(["b", "b"], categories=["a", "b"])
+        tm.assert_frame_equal(df, exp_parts_cats_col)
+
+        with pytest.raises(ValueError):
+            # different categories -> not sure if this should fail or pass
+            df = orig.copy()
+            df.iloc[2:4, 0] = Categorical(list("bb"), categories=list("abc"))
+
+        with pytest.raises(ValueError):
+            # different values
+            df = orig.copy()
+            df.iloc[2:4, 0] = Categorical(list("cc"), categories=list("abc"))
+
+        # assign a part of a column with dtype != categorical ->
+        # exp_parts_cats_col
+        df = orig.copy()
+        df.iloc[2:4, 0] = ["b", "b"]
+        tm.assert_frame_equal(df, exp_parts_cats_col)
+
+        with pytest.raises(ValueError):
+            df.iloc[2:4, 0] = ["c", "c"]
+
+        # loc
+        # ##############
+        # - assign a single value -> exp_single_cats_value
+        df = orig.copy()
+        df.loc["j", "cats"] = "b"
+        tm.assert_frame_equal(df, exp_single_cats_value)
+
+        df = orig.copy()
+        df.loc[df.index == "j", "cats"] = "b"
+        tm.assert_frame_equal(df, exp_single_cats_value)
+
+        # - assign a single value not in the current categories set
+        with pytest.raises(ValueError):
+            df = orig.copy()
+            df.loc["j", "cats"] = "c"
+
+        # - assign a complete row (mixed values) -> exp_single_row
+        df = orig.copy()
+        df.loc["j", :] = ["b", 2]
+        tm.assert_frame_equal(df, exp_single_row)
+
+        # - assign a complete row (mixed values) not in categories set
+        with pytest.raises(ValueError):
+            df = orig.copy()
+            df.loc["j", :] = ["c", 2]
+
+        # - assign multiple rows (mixed values) -> exp_multi_row
+        df = orig.copy()
+        df.loc["j":"k", :] = [["b", 2], ["b", 2]]
+        tm.assert_frame_equal(df, exp_multi_row)
+
+        with pytest.raises(ValueError):
+            df = orig.copy()
+            df.loc["j":"k", :] = [["c", 2], ["c", 2]]
+
+        # assign a part of a column with dtype == categorical ->
+        # exp_parts_cats_col
+        df = orig.copy()
+        df.loc["j":"k", "cats"] = Categorical(["b", "b"], categories=["a", "b"])
+        tm.assert_frame_equal(df, exp_parts_cats_col)
+
+        with pytest.raises(ValueError):
+            # different categories -> not sure if this should fail or pass
+            df = orig.copy()
+            df.loc["j":"k", "cats"] = Categorical(
+                ["b", "b"], categories=["a", "b", "c"]
+            )
+
+        with pytest.raises(ValueError):
+            # different values
+            df = orig.copy()
+            df.loc["j":"k", "cats"] = Categorical(
+                ["c", "c"], categories=["a", "b", "c"]
+            )
+
+        # assign a part of a column with dtype != categorical ->
+        # exp_parts_cats_col
+        df = orig.copy()
+        df.loc["j":"k", "cats"] = ["b", "b"]
+        tm.assert_frame_equal(df, exp_parts_cats_col)
+
+        with pytest.raises(ValueError):
+            df.loc["j":"k", "cats"] = ["c", "c"]
+
+        # loc, with the column label looked up through df.columns[0]
+        # ##############
+        # - assign a single value -> exp_single_cats_value
+        df = orig.copy()
+        df.loc["j", df.columns[0]] = "b"
+        tm.assert_frame_equal(df, exp_single_cats_value)
+
+        df = orig.copy()
+        df.loc[df.index == "j", df.columns[0]] = "b"
+        tm.assert_frame_equal(df, exp_single_cats_value)
+
+        # - assign a single value not in the current categories set
+        with pytest.raises(ValueError):
+            df = orig.copy()
+            df.loc["j", df.columns[0]] = "c"
+
+        # - assign a complete row (mixed values) -> exp_single_row
+        df = orig.copy()
+        df.loc["j", :] = ["b", 2]
+        tm.assert_frame_equal(df, exp_single_row)
+
+        # - assign a complete row (mixed values) not in categories set
+        with pytest.raises(ValueError):
+            df = orig.copy()
+            df.loc["j", :] = ["c", 2]
+
+        # - assign multiple rows (mixed values) -> exp_multi_row
+        df = orig.copy()
+        df.loc["j":"k", :] = [["b", 2], ["b", 2]]
+        tm.assert_frame_equal(df, exp_multi_row)
+
+        with pytest.raises(ValueError):
+            df = orig.copy()
+            df.loc["j":"k", :] = [["c", 2], ["c", 2]]
+
+        # assign a part of a column with dtype == categorical ->
+        # exp_parts_cats_col
+        df = orig.copy()
+        df.loc["j":"k", df.columns[0]] = Categorical(["b", "b"], categories=["a", "b"])
+        tm.assert_frame_equal(df, exp_parts_cats_col)
+
+        with pytest.raises(ValueError):
+            # different categories -> not sure if this should fail or pass
+            df = orig.copy()
+            df.loc["j":"k", df.columns[0]] = Categorical(
+                ["b", "b"], categories=["a", "b", "c"]
+            )
+
+        with pytest.raises(ValueError):
+            # different values
+            df = orig.copy()
+            df.loc["j":"k", df.columns[0]] = Categorical(
+                ["c", "c"], categories=["a", "b", "c"]
+            )
+
+        # assign a part of a column with dtype != categorical ->
+        # exp_parts_cats_col
+        df = orig.copy()
+        df.loc["j":"k", df.columns[0]] = ["b", "b"]
+        tm.assert_frame_equal(df, exp_parts_cats_col)
+
+        with pytest.raises(ValueError):
+            df.loc["j":"k", df.columns[0]] = ["c", "c"]
+
+        # iat
+        df = orig.copy()
+        df.iat[2, 0] = "b"
+        tm.assert_frame_equal(df, exp_single_cats_value)
+
+        # - assign a single value not in the current categories set
+        with pytest.raises(ValueError):
+            df = orig.copy()
+            df.iat[2, 0] = "c"
+
+        # at
+        # - assign a single value -> exp_single_cats_value
+        df = orig.copy()
+        df.at["j", "cats"] = "b"
+        tm.assert_frame_equal(df, exp_single_cats_value)
+
+        # - assign a single value not in the current categories set
+        with pytest.raises(ValueError):
+            df = orig.copy()
+            df.at["j", "cats"] = "c"
+
+        # fancy indexing
+        catsf = Categorical(
+            ["a", "a", "c", "c", "a", "a", "a"], categories=["a", "b", "c"]
+        )
+        idxf = Index(["h", "i", "j", "k", "l", "m", "n"])
+        valuesf = [1, 1, 3, 3, 1, 1, 1]
+        df = DataFrame({"cats": catsf, "values": valuesf}, index=idxf)
+
+        exp_fancy = exp_multi_row.copy()
+        exp_fancy["cats"].cat.set_categories(["a", "b", "c"], inplace=True)
+
+        df[df["cats"] == "c"] = ["b", 2]
+        # category c is kept in .categories
+        tm.assert_frame_equal(df, exp_fancy)
+
+        # set_value: exercised here through .at (same checks as the "at" part)
+        df = orig.copy()
+        df.at["j", "cats"] = "b"
+        tm.assert_frame_equal(df, exp_single_cats_value)
+
+        with pytest.raises(ValueError):
+            df = orig.copy()
+            df.at["j", "cats"] = "c"
+
+        # Assigning a Category to parts of an int/... column uses the values of
+        # the Categorical
+        df = DataFrame({"a": [1, 1, 1, 1, 1], "b": list("aaaaa")})
+        exp = DataFrame({"a": [1, "b", "b", 1, 1], "b": list("aabba")})
+        df.loc[1:2, "a"] = Categorical(["b", "b"], categories=["a", "b"])
+        df.loc[2:3, "b"] = Categorical(["b", "b"], categories=["a", "b"])
+        tm.assert_frame_equal(df, exp)
+
+    def test_functions_no_warnings(self):
+        df = DataFrame({"value": np.random.randint(0, 100, 20)})
+        labels = ["{0} - {1}".format(i, i + 9) for i in range(0, 100, 10)]
+        with tm.assert_produces_warning(False):
+            df["group"] = pd.cut(
+                df.value, range(0, 105, 10), right=False, labels=labels
+            )
+
+    def test_loc_indexing_preserves_index_category_dtype(self):
+        # GH 15166
+        df = DataFrame(
+            data=np.arange(2, 22, 2),
+            index=pd.MultiIndex(
+                levels=[pd.CategoricalIndex(["a", "b"]), range(10)],
+                codes=[[0] * 5 + [1] * 5, range(10)],
+                names=["Index1", "Index2"],
+            ),
+        )
+
+        expected = pd.CategoricalIndex(
+            ["a", "b"],
+            categories=["a", "b"],
+            ordered=False,
+            name="Index1",
+            dtype="category",
+        )
+
+        result = df.index.levels[0]
+        tm.assert_index_equal(result, expected)
+
+        result = df.loc[["a"]].index.levels[0]
+        tm.assert_index_equal(result, expected)
+
+    def test_wrong_length_cat_dtype_raises(self):
+        # GH29523
+        cat = pd.Categorical.from_codes([0, 1, 1, 0, 1, 2], ["a", "b", "c"])
+        df = pd.DataFrame({"bar": range(10)})
+        err = "Length of values does not match length of index"
+        with pytest.raises(ValueError, match=err):
+            df["foo"] = cat
diff --git a/pandas/tests/frame/indexing/test_datetime.py
b/pandas/tests/frame/indexing/test_datetime.py
new file mode 100644
index 0000000000000..bde35c04acf4f
--- /dev/null
+++ b/pandas/tests/frame/indexing/test_datetime.py
@@ -0,0 +1,62 @@
+import pandas as pd
+from pandas import DataFrame, Index, Series, date_range, notna
+import pandas.util.testing as tm
+
+
+class TestDataFrameIndexingDatetimeWithTZ:  # tz-aware datetime data in frames
+    def test_setitem(self, timezone_frame):
+
+        df = timezone_frame
+        idx = df["B"].rename("foo")
+
+        # setitem: the assigned column takes the column key as its name
+        df["C"] = idx
+        tm.assert_series_equal(df["C"], Series(idx, name="C"))
+
+        df["D"] = "foo"
+        df["D"] = idx
+        tm.assert_series_equal(df["D"], Series(idx, name="D"))
+        del df["D"]
+
+        # assert that A & C are not sharing the same base (e.g. they
+        # are copies)
+        b1 = df._data.blocks[1]
+        b2 = df._data.blocks[2]
+        tm.assert_extension_array_equal(b1.values, b2.values)
+        assert id(b1.values._data.base) != id(b2.values._data.base)
+
+        # with NaT assigned: null mask is as expected and dtypes are unchanged
+        df2 = df.copy()
+        df2.iloc[1, 1] = pd.NaT
+        df2.iloc[1, 2] = pd.NaT
+        result = df2["B"]
+        tm.assert_series_equal(notna(result), Series([True, False, True], name="B"))
+        tm.assert_series_equal(df2.dtypes, df.dtypes)
+
+    def test_set_reset(self):
+
+        idx = Index(date_range("20130101", periods=3, tz="US/Eastern"), name="foo")
+
+        # reset_index/set_index round trip must keep the tz-aware dtype and index
+        df = DataFrame({"A": [0, 1, 2]}, index=idx)
+        result = df.reset_index()
+        assert result["foo"].dtype == "datetime64[ns, US/Eastern]"
+
+        df = result.set_index("foo")
+        tm.assert_index_equal(df.index, idx)
+
+    def test_transpose(self, timezone_frame):
+
+        result = timezone_frame.T
+        expected = DataFrame(timezone_frame.values.T)
+        expected.index = ["A", "B", "C"]
+        tm.assert_frame_equal(result, expected)
+
+    def test_scalar_assignment(self):
+        # issue #19843
+        df = pd.DataFrame(index=(0, 1, 2))
+        df["now"] = pd.Timestamp("20130101", tz="UTC")
+        expected = pd.DataFrame(
+            {"now": pd.Timestamp("20130101", tz="UTC")}, index=[0, 1, 2]
+        )
+        tm.assert_frame_equal(df, expected)
diff --git a/pandas/tests/frame/test_indexing.py
b/pandas/tests/frame/indexing/test_indexing.py similarity index 73% rename from pandas/tests/frame/test_indexing.py rename to pandas/tests/frame/indexing/test_indexing.py index e37f734c6235e..24a431fe42cf8 100644 --- a/pandas/tests/frame/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -7,12 +7,10 @@ from pandas._libs.tslib import iNaT -from pandas.core.dtypes.common import is_float_dtype, is_integer, is_scalar -from pandas.core.dtypes.dtypes import CategoricalDtype +from pandas.core.dtypes.common import is_float_dtype, is_integer import pandas as pd from pandas import ( - Categorical, DataFrame, DatetimeIndex, Index, @@ -2695,576 +2693,6 @@ def test_boolean_indexing_mixed(self): with pytest.raises(TypeError, match=msg): df[df > 0.3] = 1 - def test_where(self, float_string_frame, mixed_float_frame, mixed_int_frame): - default_frame = DataFrame(np.random.randn(5, 3), columns=["A", "B", "C"]) - - def _safe_add(df): - # only add to the numeric items - def is_ok(s): - return ( - issubclass(s.dtype.type, (np.integer, np.floating)) - and s.dtype != "uint8" - ) - - return DataFrame( - dict((c, s + 1) if is_ok(s) else (c, s) for c, s in df.items()) - ) - - def _check_get(df, cond, check_dtypes=True): - other1 = _safe_add(df) - rs = df.where(cond, other1) - rs2 = df.where(cond.values, other1) - for k, v in rs.items(): - exp = Series(np.where(cond[k], df[k], other1[k]), index=v.index) - tm.assert_series_equal(v, exp, check_names=False) - tm.assert_frame_equal(rs, rs2) - - # dtypes - if check_dtypes: - assert (rs.dtypes == df.dtypes).all() - - # check getting - for df in [ - default_frame, - float_string_frame, - mixed_float_frame, - mixed_int_frame, - ]: - if df is float_string_frame: - with pytest.raises(TypeError): - df > 0 - continue - cond = df > 0 - _check_get(df, cond) - - # upcasting case (GH # 2794) - df = DataFrame( - { - c: Series([1] * 3, dtype=c) - for c in ["float32", "float64", "int32", "int64"] - } - ) - df.iloc[1, :] = 0 - result = 
df.dtypes - expected = Series( - [ - np.dtype("float32"), - np.dtype("float64"), - np.dtype("int32"), - np.dtype("int64"), - ], - index=["float32", "float64", "int32", "int64"], - ) - - # when we don't preserve boolean casts - # - # expected = Series({ 'float32' : 1, 'float64' : 3 }) - - tm.assert_series_equal(result, expected) - - # aligning - def _check_align(df, cond, other, check_dtypes=True): - rs = df.where(cond, other) - for i, k in enumerate(rs.columns): - result = rs[k] - d = df[k].values - c = cond[k].reindex(df[k].index).fillna(False).values - - if is_scalar(other): - o = other - else: - if isinstance(other, np.ndarray): - o = Series(other[:, i], index=result.index).values - else: - o = other[k].values - - new_values = d if c.all() else np.where(c, d, o) - expected = Series(new_values, index=result.index, name=k) - - # since we can't always have the correct numpy dtype - # as numpy doesn't know how to downcast, don't check - tm.assert_series_equal(result, expected, check_dtype=False) - - # dtypes - # can't check dtype when other is an ndarray - - if check_dtypes and not isinstance(other, np.ndarray): - assert (rs.dtypes == df.dtypes).all() - - for df in [float_string_frame, mixed_float_frame, mixed_int_frame]: - if df is float_string_frame: - with pytest.raises(TypeError): - df > 0 - continue - - # other is a frame - cond = (df > 0)[1:] - _check_align(df, cond, _safe_add(df)) - - # check other is ndarray - cond = df > 0 - _check_align(df, cond, (_safe_add(df).values)) - - # integers are upcast, so don't check the dtypes - cond = df > 0 - check_dtypes = all(not issubclass(s.type, np.integer) for s in df.dtypes) - _check_align(df, cond, np.nan, check_dtypes=check_dtypes) - - # invalid conditions - df = default_frame - err1 = (df + 1).values[0:2, :] - msg = "other must be the same shape as self when an ndarray" - with pytest.raises(ValueError, match=msg): - df.where(cond, err1) - - err2 = cond.iloc[:2, :].values - other1 = _safe_add(df) - msg = "Array 
conditional must be same shape as self" - with pytest.raises(ValueError, match=msg): - df.where(err2, other1) - - with pytest.raises(ValueError, match=msg): - df.mask(True) - with pytest.raises(ValueError, match=msg): - df.mask(0) - - # where inplace - def _check_set(df, cond, check_dtypes=True): - dfi = df.copy() - econd = cond.reindex_like(df).fillna(True) - expected = dfi.mask(~econd) - - dfi.where(cond, np.nan, inplace=True) - tm.assert_frame_equal(dfi, expected) - - # dtypes (and confirm upcasts)x - if check_dtypes: - for k, v in df.dtypes.items(): - if issubclass(v.type, np.integer) and not cond[k].all(): - v = np.dtype("float64") - assert dfi[k].dtype == v - - for df in [ - default_frame, - float_string_frame, - mixed_float_frame, - mixed_int_frame, - ]: - if df is float_string_frame: - with pytest.raises(TypeError): - df > 0 - continue - - cond = df > 0 - _check_set(df, cond) - - cond = df >= 0 - _check_set(df, cond) - - # aligning - cond = (df >= 0)[1:] - _check_set(df, cond) - - # GH 10218 - # test DataFrame.where with Series slicing - df = DataFrame({"a": range(3), "b": range(4, 7)}) - result = df.where(df["a"] == 1) - expected = df[df["a"] == 1].reindex(df.index) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("klass", [list, tuple, np.array]) - def test_where_array_like(self, klass): - # see gh-15414 - df = DataFrame({"a": [1, 2, 3]}) - cond = [[False], [True], [True]] - expected = DataFrame({"a": [np.nan, 2, 3]}) - - result = df.where(klass(cond)) - tm.assert_frame_equal(result, expected) - - df["b"] = 2 - expected["b"] = [2, np.nan, 2] - cond = [[False, True], [True, False], [True, True]] - - result = df.where(klass(cond)) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( - "cond", - [ - [[1], [0], [1]], - Series([[2], [5], [7]]), - DataFrame({"a": [2, 5, 7]}), - [["True"], ["False"], ["True"]], - [[Timestamp("2017-01-01")], [pd.NaT], [Timestamp("2017-01-02")]], - ], - ) - def 
test_where_invalid_input_single(self, cond): - # see gh-15414: only boolean arrays accepted - df = DataFrame({"a": [1, 2, 3]}) - msg = "Boolean array expected for the condition" - - with pytest.raises(ValueError, match=msg): - df.where(cond) - - @pytest.mark.parametrize( - "cond", - [ - [[0, 1], [1, 0], [1, 1]], - Series([[0, 2], [5, 0], [4, 7]]), - [["False", "True"], ["True", "False"], ["True", "True"]], - DataFrame({"a": [2, 5, 7], "b": [4, 8, 9]}), - [ - [pd.NaT, Timestamp("2017-01-01")], - [Timestamp("2017-01-02"), pd.NaT], - [Timestamp("2017-01-03"), Timestamp("2017-01-03")], - ], - ], - ) - def test_where_invalid_input_multiple(self, cond): - # see gh-15414: only boolean arrays accepted - df = DataFrame({"a": [1, 2, 3], "b": [2, 2, 2]}) - msg = "Boolean array expected for the condition" - - with pytest.raises(ValueError, match=msg): - df.where(cond) - - def test_where_dataframe_col_match(self): - df = DataFrame([[1, 2, 3], [4, 5, 6]]) - cond = DataFrame([[True, False, True], [False, False, True]]) - - result = df.where(cond) - expected = DataFrame([[1.0, np.nan, 3], [np.nan, np.nan, 6]]) - tm.assert_frame_equal(result, expected) - - # this *does* align, though has no matching columns - cond.columns = ["a", "b", "c"] - result = df.where(cond) - expected = DataFrame(np.nan, index=df.index, columns=df.columns) - tm.assert_frame_equal(result, expected) - - def test_where_ndframe_align(self): - msg = "Array conditional must be same shape as self" - df = DataFrame([[1, 2, 3], [4, 5, 6]]) - - cond = [True] - with pytest.raises(ValueError, match=msg): - df.where(cond) - - expected = DataFrame([[1, 2, 3], [np.nan, np.nan, np.nan]]) - - out = df.where(Series(cond)) - tm.assert_frame_equal(out, expected) - - cond = np.array([False, True, False, True]) - with pytest.raises(ValueError, match=msg): - df.where(cond) - - expected = DataFrame([[np.nan, np.nan, np.nan], [4, 5, 6]]) - - out = df.where(Series(cond)) - tm.assert_frame_equal(out, expected) - - def 
test_where_bug(self): - # see gh-2793 - df = DataFrame( - {"a": [1.0, 2.0, 3.0, 4.0], "b": [4.0, 3.0, 2.0, 1.0]}, dtype="float64" - ) - expected = DataFrame( - {"a": [np.nan, np.nan, 3.0, 4.0], "b": [4.0, 3.0, np.nan, np.nan]}, - dtype="float64", - ) - result = df.where(df > 2, np.nan) - tm.assert_frame_equal(result, expected) - - result = df.copy() - result.where(result > 2, np.nan, inplace=True) - tm.assert_frame_equal(result, expected) - - def test_where_bug_mixed(self, sint_dtype): - # see gh-2793 - df = DataFrame( - { - "a": np.array([1, 2, 3, 4], dtype=sint_dtype), - "b": np.array([4.0, 3.0, 2.0, 1.0], dtype="float64"), - } - ) - - expected = DataFrame( - {"a": [np.nan, np.nan, 3.0, 4.0], "b": [4.0, 3.0, np.nan, np.nan]}, - dtype="float64", - ) - - result = df.where(df > 2, np.nan) - tm.assert_frame_equal(result, expected) - - result = df.copy() - result.where(result > 2, np.nan, inplace=True) - tm.assert_frame_equal(result, expected) - - def test_where_bug_transposition(self): - # see gh-7506 - a = DataFrame({0: [1, 2], 1: [3, 4], 2: [5, 6]}) - b = DataFrame({0: [np.nan, 8], 1: [9, np.nan], 2: [np.nan, np.nan]}) - do_not_replace = b.isna() | (a > b) - - expected = a.copy() - expected[~do_not_replace] = b - - result = a.where(do_not_replace, b) - tm.assert_frame_equal(result, expected) - - a = DataFrame({0: [4, 6], 1: [1, 0]}) - b = DataFrame({0: [np.nan, 3], 1: [3, np.nan]}) - do_not_replace = b.isna() | (a > b) - - expected = a.copy() - expected[~do_not_replace] = b - - result = a.where(do_not_replace, b) - tm.assert_frame_equal(result, expected) - - def test_where_datetime(self): - - # GH 3311 - df = DataFrame( - dict( - A=date_range("20130102", periods=5), - B=date_range("20130104", periods=5), - C=np.random.randn(5), - ) - ) - - stamp = datetime(2013, 1, 3) - with pytest.raises(TypeError): - df > stamp - - result = df[df.iloc[:, :-1] > stamp] - - expected = df.copy() - expected.loc[[0, 1], "A"] = np.nan - expected.loc[:, "C"] = np.nan - 
tm.assert_frame_equal(result, expected) - - def test_where_none(self): - # GH 4667 - # setting with None changes dtype - df = DataFrame({"series": Series(range(10))}).astype(float) - df[df > 7] = None - expected = DataFrame( - {"series": Series([0, 1, 2, 3, 4, 5, 6, 7, np.nan, np.nan])} - ) - tm.assert_frame_equal(df, expected) - - # GH 7656 - df = DataFrame( - [ - {"A": 1, "B": np.nan, "C": "Test"}, - {"A": np.nan, "B": "Test", "C": np.nan}, - ] - ) - msg = "boolean setting on mixed-type" - - with pytest.raises(TypeError, match=msg): - df.where(~isna(df), None, inplace=True) - - def test_where_empty_df_and_empty_cond_having_non_bool_dtypes(self): - # see gh-21947 - df = pd.DataFrame(columns=["a"]) - cond = df.applymap(lambda x: x > 0) - - result = df.where(cond) - tm.assert_frame_equal(result, df) - - def test_where_align(self): - def create(): - df = DataFrame(np.random.randn(10, 3)) - df.iloc[3:5, 0] = np.nan - df.iloc[4:6, 1] = np.nan - df.iloc[5:8, 2] = np.nan - return df - - # series - df = create() - expected = df.fillna(df.mean()) - result = df.where(pd.notna(df), df.mean(), axis="columns") - tm.assert_frame_equal(result, expected) - - df.where(pd.notna(df), df.mean(), inplace=True, axis="columns") - tm.assert_frame_equal(df, expected) - - df = create().fillna(0) - expected = df.apply(lambda x, y: x.where(x > 0, y), y=df[0]) - result = df.where(df > 0, df[0], axis="index") - tm.assert_frame_equal(result, expected) - result = df.where(df > 0, df[0], axis="rows") - tm.assert_frame_equal(result, expected) - - # frame - df = create() - expected = df.fillna(1) - result = df.where( - pd.notna(df), DataFrame(1, index=df.index, columns=df.columns) - ) - tm.assert_frame_equal(result, expected) - - def test_where_complex(self): - # GH 6345 - expected = DataFrame([[1 + 1j, 2], [np.nan, 4 + 1j]], columns=["a", "b"]) - df = DataFrame([[1 + 1j, 2], [5 + 1j, 4 + 1j]], columns=["a", "b"]) - df[df.abs() >= 5] = np.nan - tm.assert_frame_equal(df, expected) - - def 
test_where_axis(self): - # GH 9736 - df = DataFrame(np.random.randn(2, 2)) - mask = DataFrame([[False, False], [False, False]]) - s = Series([0, 1]) - - expected = DataFrame([[0, 0], [1, 1]], dtype="float64") - result = df.where(mask, s, axis="index") - tm.assert_frame_equal(result, expected) - - result = df.copy() - result.where(mask, s, axis="index", inplace=True) - tm.assert_frame_equal(result, expected) - - expected = DataFrame([[0, 1], [0, 1]], dtype="float64") - result = df.where(mask, s, axis="columns") - tm.assert_frame_equal(result, expected) - - result = df.copy() - result.where(mask, s, axis="columns", inplace=True) - tm.assert_frame_equal(result, expected) - - # Upcast needed - df = DataFrame([[1, 2], [3, 4]], dtype="int64") - mask = DataFrame([[False, False], [False, False]]) - s = Series([0, np.nan]) - - expected = DataFrame([[0, 0], [np.nan, np.nan]], dtype="float64") - result = df.where(mask, s, axis="index") - tm.assert_frame_equal(result, expected) - - result = df.copy() - result.where(mask, s, axis="index", inplace=True) - tm.assert_frame_equal(result, expected) - - expected = DataFrame([[0, np.nan], [0, np.nan]]) - result = df.where(mask, s, axis="columns") - tm.assert_frame_equal(result, expected) - - expected = DataFrame( - { - 0: np.array([0, 0], dtype="int64"), - 1: np.array([np.nan, np.nan], dtype="float64"), - } - ) - result = df.copy() - result.where(mask, s, axis="columns", inplace=True) - tm.assert_frame_equal(result, expected) - - # Multiple dtypes (=> multiple Blocks) - df = pd.concat( - [ - DataFrame(np.random.randn(10, 2)), - DataFrame(np.random.randint(0, 10, size=(10, 2)), dtype="int64"), - ], - ignore_index=True, - axis=1, - ) - mask = DataFrame(False, columns=df.columns, index=df.index) - s1 = Series(1, index=df.columns) - s2 = Series(2, index=df.index) - - result = df.where(mask, s1, axis="columns") - expected = DataFrame(1.0, columns=df.columns, index=df.index) - expected[2] = expected[2].astype("int64") - expected[3] = 
expected[3].astype("int64") - tm.assert_frame_equal(result, expected) - - result = df.copy() - result.where(mask, s1, axis="columns", inplace=True) - tm.assert_frame_equal(result, expected) - - result = df.where(mask, s2, axis="index") - expected = DataFrame(2.0, columns=df.columns, index=df.index) - expected[2] = expected[2].astype("int64") - expected[3] = expected[3].astype("int64") - tm.assert_frame_equal(result, expected) - - result = df.copy() - result.where(mask, s2, axis="index", inplace=True) - tm.assert_frame_equal(result, expected) - - # DataFrame vs DataFrame - d1 = df.copy().drop(1, axis=0) - expected = df.copy() - expected.loc[1, :] = np.nan - - result = df.where(mask, d1) - tm.assert_frame_equal(result, expected) - result = df.where(mask, d1, axis="index") - tm.assert_frame_equal(result, expected) - result = df.copy() - result.where(mask, d1, inplace=True) - tm.assert_frame_equal(result, expected) - result = df.copy() - result.where(mask, d1, inplace=True, axis="index") - tm.assert_frame_equal(result, expected) - - d2 = df.copy().drop(1, axis=1) - expected = df.copy() - expected.loc[:, 1] = np.nan - - result = df.where(mask, d2) - tm.assert_frame_equal(result, expected) - result = df.where(mask, d2, axis="columns") - tm.assert_frame_equal(result, expected) - result = df.copy() - result.where(mask, d2, inplace=True) - tm.assert_frame_equal(result, expected) - result = df.copy() - result.where(mask, d2, inplace=True, axis="columns") - tm.assert_frame_equal(result, expected) - - def test_where_callable(self): - # GH 12533 - df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) - result = df.where(lambda x: x > 4, lambda x: x + 1) - exp = DataFrame([[2, 3, 4], [5, 5, 6], [7, 8, 9]]) - tm.assert_frame_equal(result, exp) - tm.assert_frame_equal(result, df.where(df > 4, df + 1)) - - # return ndarray and scalar - result = df.where(lambda x: (x % 2 == 0).values, lambda x: 99) - exp = DataFrame([[99, 2, 99], [4, 99, 6], [99, 8, 99]]) - tm.assert_frame_equal(result, 
exp) - tm.assert_frame_equal(result, df.where(df % 2 == 0, 99)) - - # chain - result = (df + 2).where(lambda x: x > 8, lambda x: x + 10) - exp = DataFrame([[13, 14, 15], [16, 17, 18], [9, 10, 11]]) - tm.assert_frame_equal(result, exp) - tm.assert_frame_equal(result, (df + 2).where((df + 2) > 8, (df + 2) + 10)) - - def test_where_tz_values(self, tz_naive_fixture): - df1 = DataFrame( - DatetimeIndex(["20150101", "20150102", "20150103"], tz=tz_naive_fixture), - columns=["date"], - ) - df2 = DataFrame( - DatetimeIndex(["20150103", "20150104", "20150105"], tz=tz_naive_fixture), - columns=["date"], - ) - mask = DataFrame([True, True, False], columns=["date"]) - exp = DataFrame( - DatetimeIndex(["20150101", "20150102", "20150105"], tz=tz_naive_fixture), - columns=["date"], - ) - result = df1.where(mask, df2) - tm.assert_frame_equal(exp, result) - def test_mask(self): df = DataFrame(np.random.randn(5, 3)) cond = df > 0 @@ -3402,65 +2830,6 @@ def test_interval_index(self): tm.assert_series_equal(result, expected) -class TestDataFrameIndexingDatetimeWithTZ: - def test_setitem(self, timezone_frame): - - df = timezone_frame - idx = df["B"].rename("foo") - - # setitem - df["C"] = idx - tm.assert_series_equal(df["C"], Series(idx, name="C")) - - df["D"] = "foo" - df["D"] = idx - tm.assert_series_equal(df["D"], Series(idx, name="D")) - del df["D"] - - # assert that A & C are not sharing the same base (e.g. 
they - # are copies) - b1 = df._data.blocks[1] - b2 = df._data.blocks[2] - tm.assert_extension_array_equal(b1.values, b2.values) - assert id(b1.values._data.base) != id(b2.values._data.base) - - # with nan - df2 = df.copy() - df2.iloc[1, 1] = pd.NaT - df2.iloc[1, 2] = pd.NaT - result = df2["B"] - tm.assert_series_equal(notna(result), Series([True, False, True], name="B")) - tm.assert_series_equal(df2.dtypes, df.dtypes) - - def test_set_reset(self): - - idx = Index(date_range("20130101", periods=3, tz="US/Eastern"), name="foo") - - # set/reset - df = DataFrame({"A": [0, 1, 2]}, index=idx) - result = df.reset_index() - assert result["foo"].dtype, "M8[ns, US/Eastern" - - df = result.set_index("foo") - tm.assert_index_equal(df.index, idx) - - def test_transpose(self, timezone_frame): - - result = timezone_frame.T - expected = DataFrame(timezone_frame.values.T) - expected.index = ["A", "B", "C"] - tm.assert_frame_equal(result, expected) - - def test_scalar_assignment(self): - # issue #19843 - df = pd.DataFrame(index=(0, 1, 2)) - df["now"] = pd.Timestamp("20130101", tz="UTC") - expected = pd.DataFrame( - {"now": pd.Timestamp("20130101", tz="UTC")}, index=[0, 1, 2] - ) - tm.assert_frame_equal(df, expected) - - class TestDataFrameIndexingUInt64: def test_setitem(self, uint64_frame): @@ -3509,383 +2878,3 @@ def test_transpose(self, uint64_frame): expected = DataFrame(uint64_frame.values.T) expected.index = ["A", "B"] tm.assert_frame_equal(result, expected) - - -class TestDataFrameIndexingCategorical: - def test_assignment(self): - # assignment - df = DataFrame( - {"value": np.array(np.random.randint(0, 10000, 100), dtype="int32")} - ) - labels = Categorical( - ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)] - ) - - df = df.sort_values(by=["value"], ascending=True) - s = pd.cut(df.value, range(0, 10500, 500), right=False, labels=labels) - d = s.values - df["D"] = d - str(df) - - result = df.dtypes - expected = Series( - [np.dtype("int32"), 
CategoricalDtype(categories=labels, ordered=False)], - index=["value", "D"], - ) - tm.assert_series_equal(result, expected) - - df["E"] = s - str(df) - - result = df.dtypes - expected = Series( - [ - np.dtype("int32"), - CategoricalDtype(categories=labels, ordered=False), - CategoricalDtype(categories=labels, ordered=False), - ], - index=["value", "D", "E"], - ) - tm.assert_series_equal(result, expected) - - result1 = df["D"] - result2 = df["E"] - tm.assert_categorical_equal(result1._data._block.values, d) - - # sorting - s.name = "E" - tm.assert_series_equal(result2.sort_index(), s.sort_index()) - - cat = Categorical([1, 2, 3, 10], categories=[1, 2, 3, 4, 10]) - df = DataFrame(Series(cat)) - - def test_assigning_ops(self): - # systematically test the assigning operations: - # for all slicing ops: - # for value in categories and value not in categories: - - # - assign a single value -> exp_single_cats_value - - # - assign a complete row (mixed values) -> exp_single_row - - # assign multiple rows (mixed values) (-> array) -> exp_multi_row - - # assign a part of a column with dtype == categorical -> - # exp_parts_cats_col - - # assign a part of a column with dtype != categorical -> - # exp_parts_cats_col - - cats = Categorical(["a", "a", "a", "a", "a", "a", "a"], categories=["a", "b"]) - idx = Index(["h", "i", "j", "k", "l", "m", "n"]) - values = [1, 1, 1, 1, 1, 1, 1] - orig = DataFrame({"cats": cats, "values": values}, index=idx) - - # the expected values - # changed single row - cats1 = Categorical(["a", "a", "b", "a", "a", "a", "a"], categories=["a", "b"]) - idx1 = Index(["h", "i", "j", "k", "l", "m", "n"]) - values1 = [1, 1, 2, 1, 1, 1, 1] - exp_single_row = DataFrame({"cats": cats1, "values": values1}, index=idx1) - - # changed multiple rows - cats2 = Categorical(["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"]) - idx2 = Index(["h", "i", "j", "k", "l", "m", "n"]) - values2 = [1, 1, 2, 2, 1, 1, 1] - exp_multi_row = DataFrame({"cats": cats2, "values": 
values2}, index=idx2) - - # changed part of the cats column - cats3 = Categorical(["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"]) - idx3 = Index(["h", "i", "j", "k", "l", "m", "n"]) - values3 = [1, 1, 1, 1, 1, 1, 1] - exp_parts_cats_col = DataFrame({"cats": cats3, "values": values3}, index=idx3) - - # changed single value in cats col - cats4 = Categorical(["a", "a", "b", "a", "a", "a", "a"], categories=["a", "b"]) - idx4 = Index(["h", "i", "j", "k", "l", "m", "n"]) - values4 = [1, 1, 1, 1, 1, 1, 1] - exp_single_cats_value = DataFrame( - {"cats": cats4, "values": values4}, index=idx4 - ) - - # iloc - # ############### - # - assign a single value -> exp_single_cats_value - df = orig.copy() - df.iloc[2, 0] = "b" - tm.assert_frame_equal(df, exp_single_cats_value) - - df = orig.copy() - df.iloc[df.index == "j", 0] = "b" - tm.assert_frame_equal(df, exp_single_cats_value) - - # - assign a single value not in the current categories set - with pytest.raises(ValueError): - df = orig.copy() - df.iloc[2, 0] = "c" - - # - assign a complete row (mixed values) -> exp_single_row - df = orig.copy() - df.iloc[2, :] = ["b", 2] - tm.assert_frame_equal(df, exp_single_row) - - # - assign a complete row (mixed values) not in categories set - with pytest.raises(ValueError): - df = orig.copy() - df.iloc[2, :] = ["c", 2] - - # - assign multiple rows (mixed values) -> exp_multi_row - df = orig.copy() - df.iloc[2:4, :] = [["b", 2], ["b", 2]] - tm.assert_frame_equal(df, exp_multi_row) - - with pytest.raises(ValueError): - df = orig.copy() - df.iloc[2:4, :] = [["c", 2], ["c", 2]] - - # assign a part of a column with dtype == categorical -> - # exp_parts_cats_col - df = orig.copy() - df.iloc[2:4, 0] = Categorical(["b", "b"], categories=["a", "b"]) - tm.assert_frame_equal(df, exp_parts_cats_col) - - with pytest.raises(ValueError): - # different categories -> not sure if this should fail or pass - df = orig.copy() - df.iloc[2:4, 0] = Categorical(list("bb"), categories=list("abc")) - - 
with pytest.raises(ValueError): - # different values - df = orig.copy() - df.iloc[2:4, 0] = Categorical(list("cc"), categories=list("abc")) - - # assign a part of a column with dtype != categorical -> - # exp_parts_cats_col - df = orig.copy() - df.iloc[2:4, 0] = ["b", "b"] - tm.assert_frame_equal(df, exp_parts_cats_col) - - with pytest.raises(ValueError): - df.iloc[2:4, 0] = ["c", "c"] - - # loc - # ############## - # - assign a single value -> exp_single_cats_value - df = orig.copy() - df.loc["j", "cats"] = "b" - tm.assert_frame_equal(df, exp_single_cats_value) - - df = orig.copy() - df.loc[df.index == "j", "cats"] = "b" - tm.assert_frame_equal(df, exp_single_cats_value) - - # - assign a single value not in the current categories set - with pytest.raises(ValueError): - df = orig.copy() - df.loc["j", "cats"] = "c" - - # - assign a complete row (mixed values) -> exp_single_row - df = orig.copy() - df.loc["j", :] = ["b", 2] - tm.assert_frame_equal(df, exp_single_row) - - # - assign a complete row (mixed values) not in categories set - with pytest.raises(ValueError): - df = orig.copy() - df.loc["j", :] = ["c", 2] - - # - assign multiple rows (mixed values) -> exp_multi_row - df = orig.copy() - df.loc["j":"k", :] = [["b", 2], ["b", 2]] - tm.assert_frame_equal(df, exp_multi_row) - - with pytest.raises(ValueError): - df = orig.copy() - df.loc["j":"k", :] = [["c", 2], ["c", 2]] - - # assign a part of a column with dtype == categorical -> - # exp_parts_cats_col - df = orig.copy() - df.loc["j":"k", "cats"] = Categorical(["b", "b"], categories=["a", "b"]) - tm.assert_frame_equal(df, exp_parts_cats_col) - - with pytest.raises(ValueError): - # different categories -> not sure if this should fail or pass - df = orig.copy() - df.loc["j":"k", "cats"] = Categorical( - ["b", "b"], categories=["a", "b", "c"] - ) - - with pytest.raises(ValueError): - # different values - df = orig.copy() - df.loc["j":"k", "cats"] = Categorical( - ["c", "c"], categories=["a", "b", "c"] - ) - - # 
assign a part of a column with dtype != categorical -> - # exp_parts_cats_col - df = orig.copy() - df.loc["j":"k", "cats"] = ["b", "b"] - tm.assert_frame_equal(df, exp_parts_cats_col) - - with pytest.raises(ValueError): - df.loc["j":"k", "cats"] = ["c", "c"] - - # loc - # ############## - # - assign a single value -> exp_single_cats_value - df = orig.copy() - df.loc["j", df.columns[0]] = "b" - tm.assert_frame_equal(df, exp_single_cats_value) - - df = orig.copy() - df.loc[df.index == "j", df.columns[0]] = "b" - tm.assert_frame_equal(df, exp_single_cats_value) - - # - assign a single value not in the current categories set - with pytest.raises(ValueError): - df = orig.copy() - df.loc["j", df.columns[0]] = "c" - - # - assign a complete row (mixed values) -> exp_single_row - df = orig.copy() - df.loc["j", :] = ["b", 2] - tm.assert_frame_equal(df, exp_single_row) - - # - assign a complete row (mixed values) not in categories set - with pytest.raises(ValueError): - df = orig.copy() - df.loc["j", :] = ["c", 2] - - # - assign multiple rows (mixed values) -> exp_multi_row - df = orig.copy() - df.loc["j":"k", :] = [["b", 2], ["b", 2]] - tm.assert_frame_equal(df, exp_multi_row) - - with pytest.raises(ValueError): - df = orig.copy() - df.loc["j":"k", :] = [["c", 2], ["c", 2]] - - # assign a part of a column with dtype == categorical -> - # exp_parts_cats_col - df = orig.copy() - df.loc["j":"k", df.columns[0]] = Categorical(["b", "b"], categories=["a", "b"]) - tm.assert_frame_equal(df, exp_parts_cats_col) - - with pytest.raises(ValueError): - # different categories -> not sure if this should fail or pass - df = orig.copy() - df.loc["j":"k", df.columns[0]] = Categorical( - ["b", "b"], categories=["a", "b", "c"] - ) - - with pytest.raises(ValueError): - # different values - df = orig.copy() - df.loc["j":"k", df.columns[0]] = Categorical( - ["c", "c"], categories=["a", "b", "c"] - ) - - # assign a part of a column with dtype != categorical -> - # exp_parts_cats_col - df = 
orig.copy() - df.loc["j":"k", df.columns[0]] = ["b", "b"] - tm.assert_frame_equal(df, exp_parts_cats_col) - - with pytest.raises(ValueError): - df.loc["j":"k", df.columns[0]] = ["c", "c"] - - # iat - df = orig.copy() - df.iat[2, 0] = "b" - tm.assert_frame_equal(df, exp_single_cats_value) - - # - assign a single value not in the current categories set - with pytest.raises(ValueError): - df = orig.copy() - df.iat[2, 0] = "c" - - # at - # - assign a single value -> exp_single_cats_value - df = orig.copy() - df.at["j", "cats"] = "b" - tm.assert_frame_equal(df, exp_single_cats_value) - - # - assign a single value not in the current categories set - with pytest.raises(ValueError): - df = orig.copy() - df.at["j", "cats"] = "c" - - # fancy indexing - catsf = Categorical( - ["a", "a", "c", "c", "a", "a", "a"], categories=["a", "b", "c"] - ) - idxf = Index(["h", "i", "j", "k", "l", "m", "n"]) - valuesf = [1, 1, 3, 3, 1, 1, 1] - df = DataFrame({"cats": catsf, "values": valuesf}, index=idxf) - - exp_fancy = exp_multi_row.copy() - exp_fancy["cats"].cat.set_categories(["a", "b", "c"], inplace=True) - - df[df["cats"] == "c"] = ["b", 2] - # category c is kept in .categories - tm.assert_frame_equal(df, exp_fancy) - - # set_value - df = orig.copy() - df.at["j", "cats"] = "b" - tm.assert_frame_equal(df, exp_single_cats_value) - - with pytest.raises(ValueError): - df = orig.copy() - df.at["j", "cats"] = "c" - - # Assigning a Category to parts of a int/... 
column uses the values of - # the Categorical - df = DataFrame({"a": [1, 1, 1, 1, 1], "b": list("aaaaa")}) - exp = DataFrame({"a": [1, "b", "b", 1, 1], "b": list("aabba")}) - df.loc[1:2, "a"] = Categorical(["b", "b"], categories=["a", "b"]) - df.loc[2:3, "b"] = Categorical(["b", "b"], categories=["a", "b"]) - tm.assert_frame_equal(df, exp) - - def test_functions_no_warnings(self): - df = DataFrame({"value": np.random.randint(0, 100, 20)}) - labels = ["{0} - {1}".format(i, i + 9) for i in range(0, 100, 10)] - with tm.assert_produces_warning(False): - df["group"] = pd.cut( - df.value, range(0, 105, 10), right=False, labels=labels - ) - - def test_loc_indexing_preserves_index_category_dtype(self): - # GH 15166 - df = DataFrame( - data=np.arange(2, 22, 2), - index=pd.MultiIndex( - levels=[pd.CategoricalIndex(["a", "b"]), range(10)], - codes=[[0] * 5 + [1] * 5, range(10)], - names=["Index1", "Index2"], - ), - ) - - expected = pd.CategoricalIndex( - ["a", "b"], - categories=["a", "b"], - ordered=False, - name="Index1", - dtype="category", - ) - - result = df.index.levels[0] - tm.assert_index_equal(result, expected) - - result = df.loc[["a"]].index.levels[0] - tm.assert_index_equal(result, expected) - - def test_wrong_length_cat_dtype_raises(self): - # GH29523 - cat = pd.Categorical.from_codes([0, 1, 1, 0, 1, 2], ["a", "b", "c"]) - df = pd.DataFrame({"bar": range(10)}) - err = "Length of values does not match length of index" - with pytest.raises(ValueError, match=err): - df["foo"] = cat diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py new file mode 100644 index 0000000000000..4fea190f28d7b --- /dev/null +++ b/pandas/tests/frame/indexing/test_where.py @@ -0,0 +1,582 @@ +from datetime import datetime + +import numpy as np +import pytest + +from pandas.core.dtypes.common import is_scalar + +import pandas as pd +from pandas import DataFrame, DatetimeIndex, Series, Timestamp, date_range, isna +import pandas.util.testing as tm 
+ + +class TestDataFrameIndexingWhere: + def test_where(self, float_string_frame, mixed_float_frame, mixed_int_frame): + default_frame = DataFrame(np.random.randn(5, 3), columns=["A", "B", "C"]) + + def _safe_add(df): + # only add to the numeric items + def is_ok(s): + return ( + issubclass(s.dtype.type, (np.integer, np.floating)) + and s.dtype != "uint8" + ) + + return DataFrame( + dict((c, s + 1) if is_ok(s) else (c, s) for c, s in df.items()) + ) + + def _check_get(df, cond, check_dtypes=True): + other1 = _safe_add(df) + rs = df.where(cond, other1) + rs2 = df.where(cond.values, other1) + for k, v in rs.items(): + exp = Series(np.where(cond[k], df[k], other1[k]), index=v.index) + tm.assert_series_equal(v, exp, check_names=False) + tm.assert_frame_equal(rs, rs2) + + # dtypes + if check_dtypes: + assert (rs.dtypes == df.dtypes).all() + + # check getting + for df in [ + default_frame, + float_string_frame, + mixed_float_frame, + mixed_int_frame, + ]: + if df is float_string_frame: + with pytest.raises(TypeError): + df > 0 + continue + cond = df > 0 + _check_get(df, cond) + + # upcasting case (GH # 2794) + df = DataFrame( + { + c: Series([1] * 3, dtype=c) + for c in ["float32", "float64", "int32", "int64"] + } + ) + df.iloc[1, :] = 0 + result = df.dtypes + expected = Series( + [ + np.dtype("float32"), + np.dtype("float64"), + np.dtype("int32"), + np.dtype("int64"), + ], + index=["float32", "float64", "int32", "int64"], + ) + + # when we don't preserve boolean casts + # + # expected = Series({ 'float32' : 1, 'float64' : 3 }) + + tm.assert_series_equal(result, expected) + + # aligning + def _check_align(df, cond, other, check_dtypes=True): + rs = df.where(cond, other) + for i, k in enumerate(rs.columns): + result = rs[k] + d = df[k].values + c = cond[k].reindex(df[k].index).fillna(False).values + + if is_scalar(other): + o = other + else: + if isinstance(other, np.ndarray): + o = Series(other[:, i], index=result.index).values + else: + o = other[k].values + + 
new_values = d if c.all() else np.where(c, d, o) + expected = Series(new_values, index=result.index, name=k) + + # since we can't always have the correct numpy dtype + # as numpy doesn't know how to downcast, don't check + tm.assert_series_equal(result, expected, check_dtype=False) + + # dtypes + # can't check dtype when other is an ndarray + + if check_dtypes and not isinstance(other, np.ndarray): + assert (rs.dtypes == df.dtypes).all() + + for df in [float_string_frame, mixed_float_frame, mixed_int_frame]: + if df is float_string_frame: + with pytest.raises(TypeError): + df > 0 + continue + + # other is a frame + cond = (df > 0)[1:] + _check_align(df, cond, _safe_add(df)) + + # check other is ndarray + cond = df > 0 + _check_align(df, cond, (_safe_add(df).values)) + + # integers are upcast, so don't check the dtypes + cond = df > 0 + check_dtypes = all(not issubclass(s.type, np.integer) for s in df.dtypes) + _check_align(df, cond, np.nan, check_dtypes=check_dtypes) + + # invalid conditions + df = default_frame + err1 = (df + 1).values[0:2, :] + msg = "other must be the same shape as self when an ndarray" + with pytest.raises(ValueError, match=msg): + df.where(cond, err1) + + err2 = cond.iloc[:2, :].values + other1 = _safe_add(df) + msg = "Array conditional must be same shape as self" + with pytest.raises(ValueError, match=msg): + df.where(err2, other1) + + with pytest.raises(ValueError, match=msg): + df.mask(True) + with pytest.raises(ValueError, match=msg): + df.mask(0) + + # where inplace + def _check_set(df, cond, check_dtypes=True): + dfi = df.copy() + econd = cond.reindex_like(df).fillna(True) + expected = dfi.mask(~econd) + + dfi.where(cond, np.nan, inplace=True) + tm.assert_frame_equal(dfi, expected) + + # dtypes (and confirm upcasts)x + if check_dtypes: + for k, v in df.dtypes.items(): + if issubclass(v.type, np.integer) and not cond[k].all(): + v = np.dtype("float64") + assert dfi[k].dtype == v + + for df in [ + default_frame, + float_string_frame, + 
mixed_float_frame, + mixed_int_frame, + ]: + if df is float_string_frame: + with pytest.raises(TypeError): + df > 0 + continue + + cond = df > 0 + _check_set(df, cond) + + cond = df >= 0 + _check_set(df, cond) + + # aligning + cond = (df >= 0)[1:] + _check_set(df, cond) + + # GH 10218 + # test DataFrame.where with Series slicing + df = DataFrame({"a": range(3), "b": range(4, 7)}) + result = df.where(df["a"] == 1) + expected = df[df["a"] == 1].reindex(df.index) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("klass", [list, tuple, np.array]) + def test_where_array_like(self, klass): + # see gh-15414 + df = DataFrame({"a": [1, 2, 3]}) + cond = [[False], [True], [True]] + expected = DataFrame({"a": [np.nan, 2, 3]}) + + result = df.where(klass(cond)) + tm.assert_frame_equal(result, expected) + + df["b"] = 2 + expected["b"] = [2, np.nan, 2] + cond = [[False, True], [True, False], [True, True]] + + result = df.where(klass(cond)) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "cond", + [ + [[1], [0], [1]], + Series([[2], [5], [7]]), + DataFrame({"a": [2, 5, 7]}), + [["True"], ["False"], ["True"]], + [[Timestamp("2017-01-01")], [pd.NaT], [Timestamp("2017-01-02")]], + ], + ) + def test_where_invalid_input_single(self, cond): + # see gh-15414: only boolean arrays accepted + df = DataFrame({"a": [1, 2, 3]}) + msg = "Boolean array expected for the condition" + + with pytest.raises(ValueError, match=msg): + df.where(cond) + + @pytest.mark.parametrize( + "cond", + [ + [[0, 1], [1, 0], [1, 1]], + Series([[0, 2], [5, 0], [4, 7]]), + [["False", "True"], ["True", "False"], ["True", "True"]], + DataFrame({"a": [2, 5, 7], "b": [4, 8, 9]}), + [ + [pd.NaT, Timestamp("2017-01-01")], + [Timestamp("2017-01-02"), pd.NaT], + [Timestamp("2017-01-03"), Timestamp("2017-01-03")], + ], + ], + ) + def test_where_invalid_input_multiple(self, cond): + # see gh-15414: only boolean arrays accepted + df = DataFrame({"a": [1, 2, 3], "b": [2, 2, 2]}) + 
msg = "Boolean array expected for the condition" + + with pytest.raises(ValueError, match=msg): + df.where(cond) + + def test_where_dataframe_col_match(self): + df = DataFrame([[1, 2, 3], [4, 5, 6]]) + cond = DataFrame([[True, False, True], [False, False, True]]) + + result = df.where(cond) + expected = DataFrame([[1.0, np.nan, 3], [np.nan, np.nan, 6]]) + tm.assert_frame_equal(result, expected) + + # this *does* align, though has no matching columns + cond.columns = ["a", "b", "c"] + result = df.where(cond) + expected = DataFrame(np.nan, index=df.index, columns=df.columns) + tm.assert_frame_equal(result, expected) + + def test_where_ndframe_align(self): + msg = "Array conditional must be same shape as self" + df = DataFrame([[1, 2, 3], [4, 5, 6]]) + + cond = [True] + with pytest.raises(ValueError, match=msg): + df.where(cond) + + expected = DataFrame([[1, 2, 3], [np.nan, np.nan, np.nan]]) + + out = df.where(Series(cond)) + tm.assert_frame_equal(out, expected) + + cond = np.array([False, True, False, True]) + with pytest.raises(ValueError, match=msg): + df.where(cond) + + expected = DataFrame([[np.nan, np.nan, np.nan], [4, 5, 6]]) + + out = df.where(Series(cond)) + tm.assert_frame_equal(out, expected) + + def test_where_bug(self): + # see gh-2793 + df = DataFrame( + {"a": [1.0, 2.0, 3.0, 4.0], "b": [4.0, 3.0, 2.0, 1.0]}, dtype="float64" + ) + expected = DataFrame( + {"a": [np.nan, np.nan, 3.0, 4.0], "b": [4.0, 3.0, np.nan, np.nan]}, + dtype="float64", + ) + result = df.where(df > 2, np.nan) + tm.assert_frame_equal(result, expected) + + result = df.copy() + result.where(result > 2, np.nan, inplace=True) + tm.assert_frame_equal(result, expected) + + def test_where_bug_mixed(self, sint_dtype): + # see gh-2793 + df = DataFrame( + { + "a": np.array([1, 2, 3, 4], dtype=sint_dtype), + "b": np.array([4.0, 3.0, 2.0, 1.0], dtype="float64"), + } + ) + + expected = DataFrame( + {"a": [np.nan, np.nan, 3.0, 4.0], "b": [4.0, 3.0, np.nan, np.nan]}, + dtype="float64", + ) + + result 
= df.where(df > 2, np.nan) + tm.assert_frame_equal(result, expected) + + result = df.copy() + result.where(result > 2, np.nan, inplace=True) + tm.assert_frame_equal(result, expected) + + def test_where_bug_transposition(self): + # see gh-7506 + a = DataFrame({0: [1, 2], 1: [3, 4], 2: [5, 6]}) + b = DataFrame({0: [np.nan, 8], 1: [9, np.nan], 2: [np.nan, np.nan]}) + do_not_replace = b.isna() | (a > b) + + expected = a.copy() + expected[~do_not_replace] = b + + result = a.where(do_not_replace, b) + tm.assert_frame_equal(result, expected) + + a = DataFrame({0: [4, 6], 1: [1, 0]}) + b = DataFrame({0: [np.nan, 3], 1: [3, np.nan]}) + do_not_replace = b.isna() | (a > b) + + expected = a.copy() + expected[~do_not_replace] = b + + result = a.where(do_not_replace, b) + tm.assert_frame_equal(result, expected) + + def test_where_datetime(self): + + # GH 3311 + df = DataFrame( + dict( + A=date_range("20130102", periods=5), + B=date_range("20130104", periods=5), + C=np.random.randn(5), + ) + ) + + stamp = datetime(2013, 1, 3) + with pytest.raises(TypeError): + df > stamp + + result = df[df.iloc[:, :-1] > stamp] + + expected = df.copy() + expected.loc[[0, 1], "A"] = np.nan + expected.loc[:, "C"] = np.nan + tm.assert_frame_equal(result, expected) + + def test_where_none(self): + # GH 4667 + # setting with None changes dtype + df = DataFrame({"series": Series(range(10))}).astype(float) + df[df > 7] = None + expected = DataFrame( + {"series": Series([0, 1, 2, 3, 4, 5, 6, 7, np.nan, np.nan])} + ) + tm.assert_frame_equal(df, expected) + + # GH 7656 + df = DataFrame( + [ + {"A": 1, "B": np.nan, "C": "Test"}, + {"A": np.nan, "B": "Test", "C": np.nan}, + ] + ) + msg = "boolean setting on mixed-type" + + with pytest.raises(TypeError, match=msg): + df.where(~isna(df), None, inplace=True) + + def test_where_empty_df_and_empty_cond_having_non_bool_dtypes(self): + # see gh-21947 + df = pd.DataFrame(columns=["a"]) + cond = df.applymap(lambda x: x > 0) + + result = df.where(cond) + 
tm.assert_frame_equal(result, df) + + def test_where_align(self): + def create(): + df = DataFrame(np.random.randn(10, 3)) + df.iloc[3:5, 0] = np.nan + df.iloc[4:6, 1] = np.nan + df.iloc[5:8, 2] = np.nan + return df + + # series + df = create() + expected = df.fillna(df.mean()) + result = df.where(pd.notna(df), df.mean(), axis="columns") + tm.assert_frame_equal(result, expected) + + df.where(pd.notna(df), df.mean(), inplace=True, axis="columns") + tm.assert_frame_equal(df, expected) + + df = create().fillna(0) + expected = df.apply(lambda x, y: x.where(x > 0, y), y=df[0]) + result = df.where(df > 0, df[0], axis="index") + tm.assert_frame_equal(result, expected) + result = df.where(df > 0, df[0], axis="rows") + tm.assert_frame_equal(result, expected) + + # frame + df = create() + expected = df.fillna(1) + result = df.where( + pd.notna(df), DataFrame(1, index=df.index, columns=df.columns) + ) + tm.assert_frame_equal(result, expected) + + def test_where_complex(self): + # GH 6345 + expected = DataFrame([[1 + 1j, 2], [np.nan, 4 + 1j]], columns=["a", "b"]) + df = DataFrame([[1 + 1j, 2], [5 + 1j, 4 + 1j]], columns=["a", "b"]) + df[df.abs() >= 5] = np.nan + tm.assert_frame_equal(df, expected) + + def test_where_axis(self): + # GH 9736 + df = DataFrame(np.random.randn(2, 2)) + mask = DataFrame([[False, False], [False, False]]) + s = Series([0, 1]) + + expected = DataFrame([[0, 0], [1, 1]], dtype="float64") + result = df.where(mask, s, axis="index") + tm.assert_frame_equal(result, expected) + + result = df.copy() + result.where(mask, s, axis="index", inplace=True) + tm.assert_frame_equal(result, expected) + + expected = DataFrame([[0, 1], [0, 1]], dtype="float64") + result = df.where(mask, s, axis="columns") + tm.assert_frame_equal(result, expected) + + result = df.copy() + result.where(mask, s, axis="columns", inplace=True) + tm.assert_frame_equal(result, expected) + + # Upcast needed + df = DataFrame([[1, 2], [3, 4]], dtype="int64") + mask = DataFrame([[False, False], 
[False, False]]) + s = Series([0, np.nan]) + + expected = DataFrame([[0, 0], [np.nan, np.nan]], dtype="float64") + result = df.where(mask, s, axis="index") + tm.assert_frame_equal(result, expected) + + result = df.copy() + result.where(mask, s, axis="index", inplace=True) + tm.assert_frame_equal(result, expected) + + expected = DataFrame([[0, np.nan], [0, np.nan]]) + result = df.where(mask, s, axis="columns") + tm.assert_frame_equal(result, expected) + + expected = DataFrame( + { + 0: np.array([0, 0], dtype="int64"), + 1: np.array([np.nan, np.nan], dtype="float64"), + } + ) + result = df.copy() + result.where(mask, s, axis="columns", inplace=True) + tm.assert_frame_equal(result, expected) + + # Multiple dtypes (=> multiple Blocks) + df = pd.concat( + [ + DataFrame(np.random.randn(10, 2)), + DataFrame(np.random.randint(0, 10, size=(10, 2)), dtype="int64"), + ], + ignore_index=True, + axis=1, + ) + mask = DataFrame(False, columns=df.columns, index=df.index) + s1 = Series(1, index=df.columns) + s2 = Series(2, index=df.index) + + result = df.where(mask, s1, axis="columns") + expected = DataFrame(1.0, columns=df.columns, index=df.index) + expected[2] = expected[2].astype("int64") + expected[3] = expected[3].astype("int64") + tm.assert_frame_equal(result, expected) + + result = df.copy() + result.where(mask, s1, axis="columns", inplace=True) + tm.assert_frame_equal(result, expected) + + result = df.where(mask, s2, axis="index") + expected = DataFrame(2.0, columns=df.columns, index=df.index) + expected[2] = expected[2].astype("int64") + expected[3] = expected[3].astype("int64") + tm.assert_frame_equal(result, expected) + + result = df.copy() + result.where(mask, s2, axis="index", inplace=True) + tm.assert_frame_equal(result, expected) + + # DataFrame vs DataFrame + d1 = df.copy().drop(1, axis=0) + expected = df.copy() + expected.loc[1, :] = np.nan + + result = df.where(mask, d1) + tm.assert_frame_equal(result, expected) + result = df.where(mask, d1, axis="index") + 
tm.assert_frame_equal(result, expected) + result = df.copy() + result.where(mask, d1, inplace=True) + tm.assert_frame_equal(result, expected) + result = df.copy() + result.where(mask, d1, inplace=True, axis="index") + tm.assert_frame_equal(result, expected) + + d2 = df.copy().drop(1, axis=1) + expected = df.copy() + expected.loc[:, 1] = np.nan + + result = df.where(mask, d2) + tm.assert_frame_equal(result, expected) + result = df.where(mask, d2, axis="columns") + tm.assert_frame_equal(result, expected) + result = df.copy() + result.where(mask, d2, inplace=True) + tm.assert_frame_equal(result, expected) + result = df.copy() + result.where(mask, d2, inplace=True, axis="columns") + tm.assert_frame_equal(result, expected) + + def test_where_callable(self): + # GH 12533 + df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + result = df.where(lambda x: x > 4, lambda x: x + 1) + exp = DataFrame([[2, 3, 4], [5, 5, 6], [7, 8, 9]]) + tm.assert_frame_equal(result, exp) + tm.assert_frame_equal(result, df.where(df > 4, df + 1)) + + # return ndarray and scalar + result = df.where(lambda x: (x % 2 == 0).values, lambda x: 99) + exp = DataFrame([[99, 2, 99], [4, 99, 6], [99, 8, 99]]) + tm.assert_frame_equal(result, exp) + tm.assert_frame_equal(result, df.where(df % 2 == 0, 99)) + + # chain + result = (df + 2).where(lambda x: x > 8, lambda x: x + 10) + exp = DataFrame([[13, 14, 15], [16, 17, 18], [9, 10, 11]]) + tm.assert_frame_equal(result, exp) + tm.assert_frame_equal(result, (df + 2).where((df + 2) > 8, (df + 2) + 10)) + + def test_where_tz_values(self, tz_naive_fixture): + df1 = DataFrame( + DatetimeIndex(["20150101", "20150102", "20150103"], tz=tz_naive_fixture), + columns=["date"], + ) + df2 = DataFrame( + DatetimeIndex(["20150103", "20150104", "20150105"], tz=tz_naive_fixture), + columns=["date"], + ) + mask = DataFrame([True, True, False], columns=["date"]) + exp = DataFrame( + DatetimeIndex(["20150101", "20150102", "20150105"], tz=tz_naive_fixture), + columns=["date"], + ) + 
result = df1.where(mask, df2) + tm.assert_frame_equal(exp, result)