From 35423669a83aa7de53085595576ec02ee1ec6637 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 19 Dec 2019 13:12:15 -0800 Subject: [PATCH 1/7] remove dummy file --- pandas/tests/test_compat.py | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 pandas/tests/test_compat.py diff --git a/pandas/tests/test_compat.py b/pandas/tests/test_compat.py deleted file mode 100644 index 4ff8b0b31e85e..0000000000000 --- a/pandas/tests/test_compat.py +++ /dev/null @@ -1,3 +0,0 @@ -""" -Testing that functions from compat work as expected -""" From a18380727f86942f9adbf074e8b7be43dab1ea60 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 19 Dec 2019 13:15:14 -0800 Subject: [PATCH 2/7] CLN --- pandas/tests/frame/test_repr_info.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py index 318b1c6add91e..b944b8f954f22 100644 --- a/pandas/tests/frame/test_repr_info.py +++ b/pandas/tests/frame/test_repr_info.py @@ -29,17 +29,17 @@ class TestDataFrameReprInfoEtc: def test_repr_empty(self): # empty - foo = repr(DataFrame()) # noqa + repr(DataFrame()) # empty with index frame = DataFrame(index=np.arange(1000)) - foo = repr(frame) # noqa + repr(frame) def test_repr_mixed(self, float_string_frame): buf = StringIO() # mixed - foo = repr(float_string_frame) # noqa + repr(float_string_frame) float_string_frame.info(verbose=False, buf=buf) @pytest.mark.slow @@ -51,7 +51,7 @@ def test_repr_mixed_big(self): biggie.loc[:20, "A"] = np.nan biggie.loc[:20, "B"] = np.nan - foo = repr(biggie) # noqa + repr(biggie) def test_repr(self, float_frame): buf = StringIO() @@ -68,7 +68,7 @@ def test_repr(self, float_frame): # columns but no index no_index = DataFrame(columns=[0, 1, 3]) - foo = repr(no_index) # noqa + repr(no_index) # no columns or index DataFrame().info(buf=buf) @@ -129,9 +129,6 @@ def test_repr_unsortable(self, float_frame): def test_repr_unicode(self): uval = "\u03c3\u03c3\u03c3\u03c3" - # TODO(wesm): is this supposed to be used? - bval = uval.encode("utf-8") # noqa - df = DataFrame({"A": [uval, uval]}) result = repr(df) From 58043b3af5d02d9f5fe64fce34d9857ac2dbd0c3 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 19 Dec 2019 13:22:14 -0800 Subject: [PATCH 3/7] refactor out TestDescribe --- pandas/tests/frame/test_analytics.py | 939 +++++++++++++------------- pandas/tests/series/test_analytics.py | 4 +- 2 files changed, 472 insertions(+), 471 deletions(-) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index cef389a6c4167..938c531095584 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -258,526 +258,251 @@ def assert_bool_op_api( getattr(bool_frame_with_na, opname)(axis=1, bool_only=False) -class TestDataFrameAnalytics: - - # --------------------------------------------------------------------- - # Correlation and covariance - - @td.skip_if_no_scipy - def test_corr_pearson(self, float_frame): - float_frame["A"][:5] = np.nan - float_frame["B"][5:10] = np.nan - - self._check_method(float_frame, "pearson") - - @td.skip_if_no_scipy - def test_corr_kendall(self, float_frame): - float_frame["A"][:5] = np.nan - float_frame["B"][5:10] = np.nan - - self._check_method(float_frame, "kendall") +class TestDescribe: + def test_describe_bool_in_mixed_frame(self): + df = DataFrame( + { + "string_data": ["a", "b", "c", "d", "e"], + "bool_data": [True, True, False, False, False], + "int_data": [10, 20, 30, 40, 50], + } + ) - @td.skip_if_no_scipy - def test_corr_spearman(self, float_frame): - float_frame["A"][:5] = np.nan - float_frame["B"][5:10] = np.nan + # Integer data are included in .describe() output, + # Boolean and string data are not. + result = df.describe() + expected = DataFrame( + {"int_data": [5, 30, df.int_data.std(), 10, 20, 30, 40, 50]}, + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + ) + tm.assert_frame_equal(result, expected) - self._check_method(float_frame, "spearman") + # Top value is a boolean value that is False + result = df.describe(include=["bool"]) - def _check_method(self, frame, method="pearson"): - correls = frame.corr(method=method) - expected = frame["A"].corr(frame["C"], method=method) - tm.assert_almost_equal(correls["A"]["C"], expected) + expected = DataFrame( + {"bool_data": [5, 2, False, 3]}, index=["count", "unique", "top", "freq"] + ) + tm.assert_frame_equal(result, expected) - @td.skip_if_no_scipy - def test_corr_non_numeric(self, float_frame, float_string_frame): - float_frame["A"][:5] = np.nan - float_frame["B"][5:10] = np.nan + def test_describe_empty_object(self): + # GH#27183 + df = pd.DataFrame({"A": [None, None]}, dtype=object) + result = df.describe() + expected = pd.DataFrame( + {"A": [0, 0, np.nan, np.nan]}, + dtype=object, + index=["count", "unique", "top", "freq"], + ) + tm.assert_frame_equal(result, expected) - # exclude non-numeric types - result = float_string_frame.corr() - expected = float_string_frame.loc[:, ["A", "B", "C", "D"]].corr() + result = df.iloc[:0].describe() tm.assert_frame_equal(result, expected) - @td.skip_if_no_scipy - @pytest.mark.parametrize("meth", ["pearson", "kendall", "spearman"]) - def test_corr_nooverlap(self, meth): - # nothing in common - df = DataFrame( + def test_describe_bool_frame(self): + # GH#13891 + df = pd.DataFrame( { - "A": [1, 1.5, 1, np.nan, np.nan, np.nan], - "B": [np.nan, np.nan, np.nan, 1, 1.5, 1], - "C": [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], + "bool_data_1": [False, False, True, True], + "bool_data_2": [False, True, True, True], } ) - rs = df.corr(meth) - assert isna(rs.loc["A", "B"]) - assert isna(rs.loc["B", "A"]) - assert rs.loc["A", "A"] == 1 - assert rs.loc["B", "B"] == 1 - assert isna(rs.loc["C", "C"]) - - @td.skip_if_no_scipy - @pytest.mark.parametrize("meth", ["pearson", "spearman"]) - def test_corr_constant(self, meth): - # constant --> all NA + result = df.describe() + expected = DataFrame( + {"bool_data_1": [4, 2, True, 2], "bool_data_2": [4, 2, True, 3]}, + index=["count", "unique", "top", "freq"], + ) + tm.assert_frame_equal(result, expected) - df = DataFrame( + df = pd.DataFrame( { - "A": [1, 1, 1, np.nan, np.nan, np.nan], - "B": [np.nan, np.nan, np.nan, 1, 1, 1], + "bool_data": [False, False, True, True, False], + "int_data": [0, 1, 2, 3, 4], } ) - rs = df.corr(meth) - assert isna(rs.values).all() - - def test_corr_int(self): - # dtypes other than float64 #1761 - df3 = DataFrame({"a": [1, 2, 3, 4], "b": [1, 2, 3, 4]}) - - df3.cov() - df3.corr() - - @td.skip_if_no_scipy - def test_corr_int_and_boolean(self): - # when dtypes of pandas series are different - # then ndarray will have dtype=object, - # so it need to be properly handled - df = DataFrame({"a": [True, False], "b": [1, 0]}) - - expected = DataFrame(np.ones((2, 2)), index=["a", "b"], columns=["a", "b"]) - for meth in ["pearson", "kendall", "spearman"]: - - with warnings.catch_warnings(record=True): - warnings.simplefilter("ignore", RuntimeWarning) - result = df.corr(meth) - tm.assert_frame_equal(result, expected) - - def test_corr_cov_independent_index_column(self): - # GH 14617 - df = pd.DataFrame(np.random.randn(4 * 10).reshape(10, 4), columns=list("abcd")) - for method in ["cov", "corr"]: - result = getattr(df, method)() - assert result.index is not result.columns - assert result.index.equals(result.columns) - - def test_corr_invalid_method(self): - # GH 22298 - df = pd.DataFrame(np.random.normal(size=(10, 2))) - msg = "method must be either 'pearson', 'spearman', 'kendall', or a callable, " - with pytest.raises(ValueError, match=msg): - df.corr(method="____") + result = df.describe() + expected = DataFrame( + {"int_data": [5, 2, df.int_data.std(), 0, 1, 2, 3, 4]}, + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + ) + tm.assert_frame_equal(result, expected) - def test_cov(self, float_frame, float_string_frame): - # min_periods no NAs (corner case) - expected = float_frame.cov() - result = float_frame.cov(min_periods=len(float_frame)) + df = pd.DataFrame( + {"bool_data": [False, False, True, True], "str_data": ["a", "b", "c", "a"]} + ) + result = df.describe() + expected = DataFrame( + {"bool_data": [4, 2, True, 2], "str_data": [4, 3, "a", 2]}, + index=["count", "unique", "top", "freq"], + ) + tm.assert_frame_equal(result, expected) - tm.assert_frame_equal(expected, result) + def test_describe_categorical(self): + df = DataFrame({"value": np.random.randint(0, 10000, 100)}) + labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)] + cat_labels = Categorical(labels, labels) - result = float_frame.cov(min_periods=len(float_frame) + 1) - assert isna(result.values).all() + df = df.sort_values(by=["value"], ascending=True) + df["value_group"] = pd.cut( + df.value, range(0, 10500, 500), right=False, labels=cat_labels + ) + cat = df - # with NAs - frame = float_frame.copy() - frame["A"][:5] = np.nan - frame["B"][5:10] = np.nan - result = float_frame.cov(min_periods=len(float_frame) - 8) - expected = float_frame.cov() - expected.loc["A", "B"] = np.nan - expected.loc["B", "A"] = np.nan + # Categoricals should not show up together with numerical columns + result = cat.describe() + assert len(result.columns) == 1 - # regular - float_frame["A"][:5] = np.nan - float_frame["B"][:10] = np.nan - cov = float_frame.cov() + # In a frame, describe() for the cat should be the same as for string + # arrays (count, unique, top, freq) - tm.assert_almost_equal(cov["A"]["C"], float_frame["A"].cov(float_frame["C"])) + cat = Categorical( + ["a", "b", "b", "b"], categories=["a", "b", "c"], ordered=True + ) + s = Series(cat) + result = s.describe() + expected = Series([4, 2, "b", 3], index=["count", "unique", "top", "freq"]) + tm.assert_series_equal(result, expected) - # exclude non-numeric types - result = float_string_frame.cov() - expected = float_string_frame.loc[:, ["A", "B", "C", "D"]].cov() - tm.assert_frame_equal(result, expected) + cat = Series(Categorical(["a", "b", "c", "c"])) + df3 = DataFrame({"cat": cat, "s": ["a", "b", "c", "c"]}) + result = df3.describe() + tm.assert_numpy_array_equal(result["cat"].values, result["s"].values) - # Single column frame - df = DataFrame(np.linspace(0.0, 1.0, 10)) - result = df.cov() + def test_describe_empty_categorical_column(self): + # GH#26397 + # Ensure the index of an an empty categorical DataFrame column + # also contains (count, unique, top, freq) + df = pd.DataFrame({"empty_col": Categorical([])}) + result = df.describe() expected = DataFrame( - np.cov(df.values.T).reshape((1, 1)), index=df.columns, columns=df.columns + {"empty_col": [0, 0, np.nan, np.nan]}, + index=["count", "unique", "top", "freq"], + dtype="object", ) tm.assert_frame_equal(result, expected) - df.loc[0] = np.nan - result = df.cov() - expected = DataFrame( - np.cov(df.values[1:].T).reshape((1, 1)), - index=df.columns, - columns=df.columns, + # ensure NaN, not None + assert np.isnan(result.iloc[2, 0]) + assert np.isnan(result.iloc[3, 0]) + + def test_describe_categorical_columns(self): + # GH#11558 + columns = pd.CategoricalIndex(["int1", "int2", "obj"], ordered=True, name="XXX") + df = DataFrame( + { + "int1": [10, 20, 30, 40, 50], + "int2": [10, 20, 30, 40, 50], + "obj": ["A", 0, None, "X", 1], + }, + columns=columns, ) - tm.assert_frame_equal(result, expected) + result = df.describe() - def test_corrwith(self, datetime_frame): - a = datetime_frame - noise = Series(np.random.randn(len(a)), index=a.index) + exp_columns = pd.CategoricalIndex( + ["int1", "int2"], + categories=["int1", "int2", "obj"], + ordered=True, + name="XXX", + ) + expected = DataFrame( + { + "int1": [5, 30, df.int1.std(), 10, 20, 30, 40, 50], + "int2": [5, 30, df.int2.std(), 10, 20, 30, 40, 50], + }, + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + columns=exp_columns, + ) - b = datetime_frame.add(noise, axis=0) + tm.assert_frame_equal(result, expected) + tm.assert_categorical_equal(result.columns.values, expected.columns.values) - # make sure order does not matter - b = b.reindex(columns=b.columns[::-1], index=b.index[::-1][10:]) - del b["B"] - - colcorr = a.corrwith(b, axis=0) - tm.assert_almost_equal(colcorr["A"], a["A"].corr(b["A"])) - - rowcorr = a.corrwith(b, axis=1) - tm.assert_series_equal(rowcorr, a.T.corrwith(b.T, axis=0)) - - dropped = a.corrwith(b, axis=0, drop=True) - tm.assert_almost_equal(dropped["A"], a["A"].corr(b["A"])) - assert "B" not in dropped - - dropped = a.corrwith(b, axis=1, drop=True) - assert a.index[-1] not in dropped.index - - # non time-series data - index = ["a", "b", "c", "d", "e"] - columns = ["one", "two", "three", "four"] - df1 = DataFrame(np.random.randn(5, 4), index=index, columns=columns) - df2 = DataFrame(np.random.randn(4, 4), index=index[:4], columns=columns) - correls = df1.corrwith(df2, axis=1) - for row in index[:4]: - tm.assert_almost_equal(correls[row], df1.loc[row].corr(df2.loc[row])) - - def test_corrwith_with_objects(self): - df1 = tm.makeTimeDataFrame() - df2 = tm.makeTimeDataFrame() - cols = ["A", "B", "C", "D"] - - df1["obj"] = "foo" - df2["obj"] = "bar" - - result = df1.corrwith(df2) - expected = df1.loc[:, cols].corrwith(df2.loc[:, cols]) - tm.assert_series_equal(result, expected) - - result = df1.corrwith(df2, axis=1) - expected = df1.loc[:, cols].corrwith(df2.loc[:, cols], axis=1) - tm.assert_series_equal(result, expected) - - def test_corrwith_series(self, datetime_frame): - result = datetime_frame.corrwith(datetime_frame["A"]) - expected = datetime_frame.apply(datetime_frame["A"].corr) - - tm.assert_series_equal(result, expected) - - def test_corrwith_matches_corrcoef(self): - df1 = DataFrame(np.arange(10000), columns=["a"]) - df2 = DataFrame(np.arange(10000) ** 2, columns=["a"]) - c1 = df1.corrwith(df2)["a"] - c2 = np.corrcoef(df1["a"], df2["a"])[0][1] - - tm.assert_almost_equal(c1, c2) - assert c1 < 1 - - def test_corrwith_mixed_dtypes(self): - # GH 18570 - df = pd.DataFrame( - {"a": [1, 4, 3, 2], "b": [4, 6, 7, 3], "c": ["a", "b", "c", "d"]} + def test_describe_datetime_columns(self): + columns = pd.DatetimeIndex( + ["2011-01-01", "2011-02-01", "2011-03-01"], + freq="MS", + tz="US/Eastern", + name="XXX", ) - s = pd.Series([0, 6, 7, 3]) - result = df.corrwith(s) - corrs = [df["a"].corr(s), df["b"].corr(s)] - expected = pd.Series(data=corrs, index=["a", "b"]) - tm.assert_series_equal(result, expected) - - def test_corrwith_index_intersection(self): - df1 = pd.DataFrame(np.random.random(size=(10, 2)), columns=["a", "b"]) - df2 = pd.DataFrame(np.random.random(size=(10, 3)), columns=["a", "b", "c"]) - - result = df1.corrwith(df2, drop=True).index.sort_values() - expected = df1.columns.intersection(df2.columns).sort_values() - tm.assert_index_equal(result, expected) - - def test_corrwith_index_union(self): - df1 = pd.DataFrame(np.random.random(size=(10, 2)), columns=["a", "b"]) - df2 = pd.DataFrame(np.random.random(size=(10, 3)), columns=["a", "b", "c"]) - - result = df1.corrwith(df2, drop=False).index.sort_values() - expected = df1.columns.union(df2.columns).sort_values() - tm.assert_index_equal(result, expected) - - def test_corrwith_dup_cols(self): - # GH 21925 - df1 = pd.DataFrame(np.vstack([np.arange(10)] * 3).T) - df2 = df1.copy() - df2 = pd.concat((df2, df2[0]), axis=1) - - result = df1.corrwith(df2) - expected = pd.Series(np.ones(4), index=[0, 0, 1, 2]) - tm.assert_series_equal(result, expected) - - @td.skip_if_no_scipy - def test_corrwith_spearman(self): - # GH 21925 - df = pd.DataFrame(np.random.random(size=(100, 3))) - result = df.corrwith(df ** 2, method="spearman") - expected = Series(np.ones(len(result))) - tm.assert_series_equal(result, expected) - - @td.skip_if_no_scipy - def test_corrwith_kendall(self): - # GH 21925 - df = pd.DataFrame(np.random.random(size=(100, 3))) - result = df.corrwith(df ** 2, method="kendall") - expected = Series(np.ones(len(result))) - tm.assert_series_equal(result, expected) - - # --------------------------------------------------------------------- - # Describe - - def test_bool_describe_in_mixed_frame(self): df = DataFrame( { - "string_data": ["a", "b", "c", "d", "e"], - "bool_data": [True, True, False, False, False], - "int_data": [10, 20, 30, 40, 50], + 0: [10, 20, 30, 40, 50], + 1: [10, 20, 30, 40, 50], + 2: ["A", 0, None, "X", 1], } ) - - # Integer data are included in .describe() output, - # Boolean and string data are not. + df.columns = columns result = df.describe() - expected = DataFrame( - {"int_data": [5, 30, df.int_data.std(), 10, 20, 30, 40, 50]}, - index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], - ) - tm.assert_frame_equal(result, expected) - # Top value is a boolean value that is False - result = df.describe(include=["bool"]) - - expected = DataFrame( - {"bool_data": [5, 2, False, 3]}, index=["count", "unique", "top", "freq"] + exp_columns = pd.DatetimeIndex( + ["2011-01-01", "2011-02-01"], freq="MS", tz="US/Eastern", name="XXX" ) - tm.assert_frame_equal(result, expected) - - def test_describe_empty_object(self): - # https://github.com/pandas-dev/pandas/issues/27183 - df = pd.DataFrame({"A": [None, None]}, dtype=object) - result = df.describe() - expected = pd.DataFrame( - {"A": [0, 0, np.nan, np.nan]}, - dtype=object, - index=["count", "unique", "top", "freq"], + expected = DataFrame( + { + 0: [5, 30, df.iloc[:, 0].std(), 10, 20, 30, 40, 50], + 1: [5, 30, df.iloc[:, 1].std(), 10, 20, 30, 40, 50], + }, + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], ) + expected.columns = exp_columns tm.assert_frame_equal(result, expected) + assert result.columns.freq == "MS" + assert result.columns.tz == expected.columns.tz - result = df.iloc[:0].describe() - tm.assert_frame_equal(result, expected) + def test_describe_timedelta_values(self): + # GH#6145 + t1 = pd.timedelta_range("1 days", freq="D", periods=5) + t2 = pd.timedelta_range("1 hours", freq="H", periods=5) + df = pd.DataFrame({"t1": t1, "t2": t2}) - def test_describe_bool_frame(self): - # GH 13891 - df = pd.DataFrame( - { - "bool_data_1": [False, False, True, True], - "bool_data_2": [False, True, True, True], - } - ) - result = df.describe() expected = DataFrame( - {"bool_data_1": [4, 2, True, 2], "bool_data_2": [4, 2, True, 3]}, - index=["count", "unique", "top", "freq"], - ) - tm.assert_frame_equal(result, expected) - - df = pd.DataFrame( { - "bool_data": [False, False, True, True, False], - "int_data": [0, 1, 2, 3, 4], - } - ) - result = df.describe() - expected = DataFrame( - {"int_data": [5, 2, df.int_data.std(), 0, 1, 2, 3, 4]}, + "t1": [ + 5, + pd.Timedelta("3 days"), + df.iloc[:, 0].std(), + pd.Timedelta("1 days"), + pd.Timedelta("2 days"), + pd.Timedelta("3 days"), + pd.Timedelta("4 days"), + pd.Timedelta("5 days"), + ], + "t2": [ + 5, + pd.Timedelta("3 hours"), + df.iloc[:, 1].std(), + pd.Timedelta("1 hours"), + pd.Timedelta("2 hours"), + pd.Timedelta("3 hours"), + pd.Timedelta("4 hours"), + pd.Timedelta("5 hours"), + ], + }, index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], ) - tm.assert_frame_equal(result, expected) - df = pd.DataFrame( - {"bool_data": [False, False, True, True], "str_data": ["a", "b", "c", "a"]} - ) result = df.describe() - expected = DataFrame( - {"bool_data": [4, 2, True, 2], "str_data": [4, 3, "a", 2]}, - index=["count", "unique", "top", "freq"], - ) tm.assert_frame_equal(result, expected) - def test_describe_categorical(self): - df = DataFrame({"value": np.random.randint(0, 10000, 100)}) - labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)] - cat_labels = Categorical(labels, labels) - - df = df.sort_values(by=["value"], ascending=True) - df["value_group"] = pd.cut( - df.value, range(0, 10500, 500), right=False, labels=cat_labels + exp_repr = ( + " t1 t2\n" + "count 5 5\n" + "mean 3 days 00:00:00 0 days 03:00:00\n" + "std 1 days 13:56:50.394919 0 days 01:34:52.099788\n" + "min 1 days 00:00:00 0 days 01:00:00\n" + "25% 2 days 00:00:00 0 days 02:00:00\n" + "50% 3 days 00:00:00 0 days 03:00:00\n" + "75% 4 days 00:00:00 0 days 04:00:00\n" + "max 5 days 00:00:00 0 days 05:00:00" ) - cat = df - - # Categoricals should not show up together with numerical columns - result = cat.describe() - assert len(result.columns) == 1 - - # In a frame, describe() for the cat should be the same as for string - # arrays (count, unique, top, freq) + assert repr(result) == exp_repr - cat = Categorical( - ["a", "b", "b", "b"], categories=["a", "b", "c"], ordered=True - ) - s = Series(cat) - result = s.describe() - expected = Series([4, 2, "b", 3], index=["count", "unique", "top", "freq"]) - tm.assert_series_equal(result, expected) - - cat = Series(Categorical(["a", "b", "c", "c"])) - df3 = DataFrame({"cat": cat, "s": ["a", "b", "c", "c"]}) - result = df3.describe() - tm.assert_numpy_array_equal(result["cat"].values, result["s"].values) - - def test_describe_empty_categorical_column(self): - # GH 26397 - # Ensure the index of an an empty categorical DataFrame column - # also contains (count, unique, top, freq) - df = pd.DataFrame({"empty_col": Categorical([])}) - result = df.describe() - expected = DataFrame( - {"empty_col": [0, 0, np.nan, np.nan]}, - index=["count", "unique", "top", "freq"], - dtype="object", - ) - tm.assert_frame_equal(result, expected) - # ensure NaN, not None - assert np.isnan(result.iloc[2, 0]) - assert np.isnan(result.iloc[3, 0]) - - def test_describe_categorical_columns(self): - # GH 11558 - columns = pd.CategoricalIndex(["int1", "int2", "obj"], ordered=True, name="XXX") - df = DataFrame( - { - "int1": [10, 20, 30, 40, 50], - "int2": [10, 20, 30, 40, 50], - "obj": ["A", 0, None, "X", 1], - }, - columns=columns, - ) - result = df.describe() - - exp_columns = pd.CategoricalIndex( - ["int1", "int2"], - categories=["int1", "int2", "obj"], - ordered=True, - name="XXX", - ) - expected = DataFrame( - { - "int1": [5, 30, df.int1.std(), 10, 20, 30, 40, 50], - "int2": [5, 30, df.int2.std(), 10, 20, 30, 40, 50], - }, - index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], - columns=exp_columns, - ) - - tm.assert_frame_equal(result, expected) - tm.assert_categorical_equal(result.columns.values, expected.columns.values) - - def test_describe_datetime_columns(self): - columns = pd.DatetimeIndex( - ["2011-01-01", "2011-02-01", "2011-03-01"], - freq="MS", - tz="US/Eastern", - name="XXX", - ) - df = DataFrame( - { - 0: [10, 20, 30, 40, 50], - 1: [10, 20, 30, 40, 50], - 2: ["A", 0, None, "X", 1], - } - ) - df.columns = columns - result = df.describe() - - exp_columns = pd.DatetimeIndex( - ["2011-01-01", "2011-02-01"], freq="MS", tz="US/Eastern", name="XXX" - ) - expected = DataFrame( - { - 0: [5, 30, df.iloc[:, 0].std(), 10, 20, 30, 40, 50], - 1: [5, 30, df.iloc[:, 1].std(), 10, 20, 30, 40, 50], - }, - index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], - ) - expected.columns = exp_columns - tm.assert_frame_equal(result, expected) - assert result.columns.freq == "MS" - assert result.columns.tz == expected.columns.tz - - def test_describe_timedelta_values(self): - # GH 6145 - t1 = pd.timedelta_range("1 days", freq="D", periods=5) - t2 = pd.timedelta_range("1 hours", freq="H", periods=5) - df = pd.DataFrame({"t1": t1, "t2": t2}) - - expected = DataFrame( - { - "t1": [ - 5, - pd.Timedelta("3 days"), - df.iloc[:, 0].std(), - pd.Timedelta("1 days"), - pd.Timedelta("2 days"), - pd.Timedelta("3 days"), - pd.Timedelta("4 days"), - pd.Timedelta("5 days"), - ], - "t2": [ - 5, - pd.Timedelta("3 hours"), - df.iloc[:, 1].std(), - pd.Timedelta("1 hours"), - pd.Timedelta("2 hours"), - pd.Timedelta("3 hours"), - pd.Timedelta("4 hours"), - pd.Timedelta("5 hours"), - ], - }, - index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], - ) - - result = df.describe() - tm.assert_frame_equal(result, expected) - - exp_repr = ( - " t1 t2\n" - "count 5 5\n" - "mean 3 days 00:00:00 0 days 03:00:00\n" - "std 1 days 13:56:50.394919 0 days 01:34:52.099788\n" - "min 1 days 00:00:00 0 days 01:00:00\n" - "25% 2 days 00:00:00 0 days 02:00:00\n" - "50% 3 days 00:00:00 0 days 03:00:00\n" - "75% 4 days 00:00:00 0 days 04:00:00\n" - "max 5 days 00:00:00 0 days 05:00:00" - ) - assert repr(result) == exp_repr - - def test_describe_tz_values(self, tz_naive_fixture): - # GH 21332 - tz = tz_naive_fixture - s1 = Series(range(5)) - start = Timestamp(2018, 1, 1) - end = Timestamp(2018, 1, 5) - s2 = Series(date_range(start, end, tz=tz)) - df = pd.DataFrame({"s1": s1, "s2": s2}) + def test_describe_tz_values(self, tz_naive_fixture): + # GH#21332 + tz = tz_naive_fixture + s1 = Series(range(5)) + start = Timestamp(2018, 1, 1) + end = Timestamp(2018, 1, 5) + s2 = Series(date_range(start, end, tz=tz)) + df = pd.DataFrame({"s1": s1, "s2": s2}) expected = DataFrame( { @@ -832,7 +557,7 @@ def test_describe_tz_values(self, tz_naive_fixture): tm.assert_frame_equal(result, expected) def test_describe_percentiles_integer_idx(self): - # Issue 26660 + # GH#26660 df = pd.DataFrame({"x": [1]}) pct = np.linspace(0, 1, 10 + 1) result = df.describe(percentiles=pct) @@ -860,6 +585,280 @@ def test_describe_percentiles_integer_idx(self): ) tm.assert_frame_equal(result, expected) + +class TestDataFrameAnalytics: + + # --------------------------------------------------------------------- + # Correlation and covariance + + @td.skip_if_no_scipy + def test_corr_pearson(self, float_frame): + float_frame["A"][:5] = np.nan + float_frame["B"][5:10] = np.nan + + self._check_method(float_frame, "pearson") + + @td.skip_if_no_scipy + def test_corr_kendall(self, float_frame): + float_frame["A"][:5] = np.nan + float_frame["B"][5:10] = np.nan + + self._check_method(float_frame, "kendall") + + @td.skip_if_no_scipy + def test_corr_spearman(self, float_frame): + float_frame["A"][:5] = np.nan + float_frame["B"][5:10] = np.nan + + self._check_method(float_frame, "spearman") + + def _check_method(self, frame, method="pearson"): + correls = frame.corr(method=method) + expected = frame["A"].corr(frame["C"], method=method) + tm.assert_almost_equal(correls["A"]["C"], expected) + + @td.skip_if_no_scipy + def test_corr_non_numeric(self, float_frame, float_string_frame): + float_frame["A"][:5] = np.nan + float_frame["B"][5:10] = np.nan + + # exclude non-numeric types + result = float_string_frame.corr() + expected = float_string_frame.loc[:, ["A", "B", "C", "D"]].corr() + tm.assert_frame_equal(result, expected) + + @td.skip_if_no_scipy + @pytest.mark.parametrize("meth", ["pearson", "kendall", "spearman"]) + def test_corr_nooverlap(self, meth): + # nothing in common + df = DataFrame( + { + "A": [1, 1.5, 1, np.nan, np.nan, np.nan], + "B": [np.nan, np.nan, np.nan, 1, 1.5, 1], + "C": [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], + } + ) + rs = df.corr(meth) + assert isna(rs.loc["A", "B"]) + assert isna(rs.loc["B", "A"]) + assert rs.loc["A", "A"] == 1 + assert rs.loc["B", "B"] == 1 + assert isna(rs.loc["C", "C"]) + + @td.skip_if_no_scipy + @pytest.mark.parametrize("meth", ["pearson", "spearman"]) + def test_corr_constant(self, meth): + # constant --> all NA + + df = DataFrame( + { + "A": [1, 1, 1, np.nan, np.nan, np.nan], + "B": [np.nan, np.nan, np.nan, 1, 1, 1], + } + ) + rs = df.corr(meth) + assert isna(rs.values).all() + + def test_corr_int(self): + # dtypes other than float64 #1761 + df3 = DataFrame({"a": [1, 2, 3, 4], "b": [1, 2, 3, 4]}) + + df3.cov() + df3.corr() + + @td.skip_if_no_scipy + def test_corr_int_and_boolean(self): + # when dtypes of pandas series are different + # then ndarray will have dtype=object, + # so it need to be properly handled + df = DataFrame({"a": [True, False], "b": [1, 0]}) + + expected = DataFrame(np.ones((2, 2)), index=["a", "b"], columns=["a", "b"]) + for meth in ["pearson", "kendall", "spearman"]: + + with warnings.catch_warnings(record=True): + warnings.simplefilter("ignore", RuntimeWarning) + result = df.corr(meth) + tm.assert_frame_equal(result, expected) + + def test_corr_cov_independent_index_column(self): + # GH 14617 + df = pd.DataFrame(np.random.randn(4 * 10).reshape(10, 4), columns=list("abcd")) + for method in ["cov", "corr"]: + result = getattr(df, method)() + assert result.index is not result.columns + assert result.index.equals(result.columns) + + def test_corr_invalid_method(self): + # GH 22298 + df = pd.DataFrame(np.random.normal(size=(10, 2))) + msg = "method must be either 'pearson', 'spearman', 'kendall', or a callable, " + with pytest.raises(ValueError, match=msg): + df.corr(method="____") + + def test_cov(self, float_frame, float_string_frame): + # min_periods no NAs (corner case) + expected = float_frame.cov() + result = float_frame.cov(min_periods=len(float_frame)) + + tm.assert_frame_equal(expected, result) + + result = float_frame.cov(min_periods=len(float_frame) + 1) + assert isna(result.values).all() + + # with NAs + frame = float_frame.copy() + frame["A"][:5] = np.nan + frame["B"][5:10] = np.nan + result = float_frame.cov(min_periods=len(float_frame) - 8) + expected = float_frame.cov() + expected.loc["A", "B"] = np.nan + expected.loc["B", "A"] = np.nan + + # regular + float_frame["A"][:5] = np.nan + float_frame["B"][:10] = np.nan + cov = float_frame.cov() + + tm.assert_almost_equal(cov["A"]["C"], float_frame["A"].cov(float_frame["C"])) + + # exclude non-numeric types + result = float_string_frame.cov() + expected = float_string_frame.loc[:, ["A", "B", "C", "D"]].cov() + tm.assert_frame_equal(result, expected) + + # Single column frame + df = DataFrame(np.linspace(0.0, 1.0, 10)) + result = df.cov() + expected = DataFrame( + np.cov(df.values.T).reshape((1, 1)), index=df.columns, columns=df.columns + ) + tm.assert_frame_equal(result, expected) + df.loc[0] = np.nan + result = df.cov() + expected = DataFrame( + np.cov(df.values[1:].T).reshape((1, 1)), + index=df.columns, + columns=df.columns, + ) + tm.assert_frame_equal(result, expected) + + def test_corrwith(self, datetime_frame): + a = datetime_frame + noise = Series(np.random.randn(len(a)), index=a.index) + + b = datetime_frame.add(noise, axis=0) + + # make sure order does not matter + b = b.reindex(columns=b.columns[::-1], index=b.index[::-1][10:]) + del b["B"] + + colcorr = a.corrwith(b, axis=0) + tm.assert_almost_equal(colcorr["A"], a["A"].corr(b["A"])) + + rowcorr = a.corrwith(b, axis=1) + tm.assert_series_equal(rowcorr, a.T.corrwith(b.T, axis=0)) + + dropped = a.corrwith(b, axis=0, drop=True) + tm.assert_almost_equal(dropped["A"], a["A"].corr(b["A"])) + assert "B" not in dropped + + dropped = a.corrwith(b, axis=1, drop=True) + assert a.index[-1] not in dropped.index + + # non time-series data + index = ["a", "b", "c", "d", "e"] + columns = ["one", "two", "three", "four"] + df1 = DataFrame(np.random.randn(5, 4), index=index, columns=columns) + df2 = DataFrame(np.random.randn(4, 4), index=index[:4], columns=columns) + correls = df1.corrwith(df2, axis=1) + for row in index[:4]: + tm.assert_almost_equal(correls[row], df1.loc[row].corr(df2.loc[row])) + + def test_corrwith_with_objects(self): + df1 = tm.makeTimeDataFrame() + df2 = tm.makeTimeDataFrame() + cols = ["A", "B", "C", "D"] + + df1["obj"] = "foo" + df2["obj"] = "bar" + + result = df1.corrwith(df2) + expected = df1.loc[:, cols].corrwith(df2.loc[:, cols]) + tm.assert_series_equal(result, expected) + + result = df1.corrwith(df2, axis=1) + expected = df1.loc[:, cols].corrwith(df2.loc[:, cols], axis=1) + tm.assert_series_equal(result, expected) + + def test_corrwith_series(self, datetime_frame): + result = datetime_frame.corrwith(datetime_frame["A"]) + expected = datetime_frame.apply(datetime_frame["A"].corr) + + tm.assert_series_equal(result, expected) + + def test_corrwith_matches_corrcoef(self): + df1 = DataFrame(np.arange(10000), columns=["a"]) + df2 = DataFrame(np.arange(10000) ** 2, columns=["a"]) + c1 = df1.corrwith(df2)["a"] + c2 = np.corrcoef(df1["a"], df2["a"])[0][1] + + tm.assert_almost_equal(c1, c2) + assert c1 < 1 + + def test_corrwith_mixed_dtypes(self): + # GH 18570 + df = pd.DataFrame( + {"a": [1, 4, 3, 2], "b": [4, 6, 7, 3], "c": ["a", "b", "c", "d"]} + ) + s = pd.Series([0, 6, 7, 3]) + result = df.corrwith(s) + corrs = [df["a"].corr(s), df["b"].corr(s)] + expected = pd.Series(data=corrs, index=["a", "b"]) + tm.assert_series_equal(result, expected) + + def test_corrwith_index_intersection(self): + df1 = pd.DataFrame(np.random.random(size=(10, 2)), columns=["a", "b"]) + df2 = pd.DataFrame(np.random.random(size=(10, 3)), columns=["a", "b", "c"]) + + result = df1.corrwith(df2, drop=True).index.sort_values() + expected = df1.columns.intersection(df2.columns).sort_values() + tm.assert_index_equal(result, expected) + + def test_corrwith_index_union(self): + df1 = pd.DataFrame(np.random.random(size=(10, 2)), columns=["a", "b"]) + df2 = pd.DataFrame(np.random.random(size=(10, 3)), columns=["a", "b", "c"]) + + result = df1.corrwith(df2, drop=False).index.sort_values() + expected = df1.columns.union(df2.columns).sort_values() + tm.assert_index_equal(result, expected) + + def test_corrwith_dup_cols(self): + # GH 21925 + df1 = pd.DataFrame(np.vstack([np.arange(10)] * 3).T) + df2 = df1.copy() + df2 = pd.concat((df2, df2[0]), axis=1) + + result = df1.corrwith(df2) + expected = pd.Series(np.ones(4), index=[0, 0, 1, 2]) + tm.assert_series_equal(result, expected) + + @td.skip_if_no_scipy + def test_corrwith_spearman(self): + # GH 21925 + df = pd.DataFrame(np.random.random(size=(100, 3))) + result = df.corrwith(df ** 2, method="spearman") + expected = Series(np.ones(len(result))) + tm.assert_series_equal(result, expected) + + @td.skip_if_no_scipy + def test_corrwith_kendall(self): + # GH 21925 + df = pd.DataFrame(np.random.random(size=(100, 3))) + result = df.corrwith(df ** 2, method="kendall") + expected = Series(np.ones(len(result))) + tm.assert_series_equal(result, expected) + # --------------------------------------------------------------------- # Reductions diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 0eb4e8a6cfdf3..2a09cac90d5ba 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -24,7 +24,7 @@ import pandas.util.testing as tm -class TestSeriesAnalytics: +class TestDescribe: def test_describe(self): s = Series([0, 1, 2, 3, 4], name="int_data") result = s.describe() @@ -88,6 +88,8 @@ def test_describe_with_tz(self, tz_naive_fixture): ) tm.assert_series_equal(result, expected) + +class TestSeriesAnalytics: def test_argsort(self, datetime_series): self._check_accum_op("argsort", datetime_series, check_dtype=False) argsorted = datetime_series.argsort() From c9f3348a583b23ad4755a1d60e0bacfe441fe0ca Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 19 Dec 2019 13:38:28 -0800 Subject: [PATCH 4/7] refactor out TestDescrbie --- pandas/tests/frame/test_analytics.py | 2 +- pandas/tests/series/test_analytics.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 938c531095584..f97ec309ed1c2 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -258,7 +258,7 @@ def assert_bool_op_api( getattr(bool_frame_with_na, opname)(axis=1, bool_only=False) -class TestDescribe: +class TestDataFrameDescribe: def test_describe_bool_in_mixed_frame(self): df = DataFrame( { diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 2a09cac90d5ba..5cc67f19e45f0 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -24,7 +24,7 @@ import pandas.util.testing as tm -class TestDescribe: +class TestSeriesDescribe: def test_describe(self): s = Series([0, 1, 2, 3, 4], name="int_data") result = s.describe() From 9c4f53c29ccc911c7e2d20cf65a9033b6c9f0f20 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 23 Dec 2019 12:46:48 -0800 Subject: [PATCH 5/7] revert --- pandas/tests/frame/test_analytics.py | 661 --------------------------- 1 file changed, 661 deletions(-) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index e119a28827606..1a241cd72ec43 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -257,334 +257,6 @@ def assert_bool_op_api( getattr(bool_frame_with_na, opname)(axis=1, bool_only=False) -class TestDataFrameDescribe: - def test_describe_bool_in_mixed_frame(self): - df = DataFrame( - { - "string_data": ["a", "b", "c", "d", "e"], - "bool_data": [True, True, False, False, False], - "int_data": [10, 20, 30, 40, 50], - } - ) - - # Integer data are included in .describe() output, - # Boolean and string data are not. - result = df.describe() - expected = DataFrame( - {"int_data": [5, 30, df.int_data.std(), 10, 20, 30, 40, 50]}, - index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], - ) - tm.assert_frame_equal(result, expected) - - # Top value is a boolean value that is False - result = df.describe(include=["bool"]) - - expected = DataFrame( - {"bool_data": [5, 2, False, 3]}, index=["count", "unique", "top", "freq"] - ) - tm.assert_frame_equal(result, expected) - - def test_describe_empty_object(self): - # GH#27183 - df = pd.DataFrame({"A": [None, None]}, dtype=object) - result = df.describe() - expected = pd.DataFrame( - {"A": [0, 0, np.nan, np.nan]}, - dtype=object, - index=["count", "unique", "top", "freq"], - ) - tm.assert_frame_equal(result, expected) - - result = df.iloc[:0].describe() - tm.assert_frame_equal(result, expected) - - def test_describe_bool_frame(self): - # GH#13891 - df = pd.DataFrame( - { - "bool_data_1": [False, False, True, True], - "bool_data_2": [False, True, True, True], - } - ) - result = df.describe() - expected = DataFrame( - {"bool_data_1": [4, 2, True, 2], "bool_data_2": [4, 2, True, 3]}, - index=["count", "unique", "top", "freq"], - ) - tm.assert_frame_equal(result, expected) - - df = pd.DataFrame( - { - "bool_data": [False, False, True, True, False], - "int_data": [0, 1, 2, 3, 4], - } - ) - result = df.describe() - expected = DataFrame( - {"int_data": [5, 2, df.int_data.std(), 0, 1, 2, 3, 4]}, - index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], - ) - tm.assert_frame_equal(result, expected) - - df = pd.DataFrame( - {"bool_data": [False, False, True, True], "str_data": ["a", "b", "c", "a"]} - ) - result = df.describe() - expected = DataFrame( - {"bool_data": [4, 2, True, 2], "str_data": [4, 3, "a", 2]}, - index=["count", "unique", "top", "freq"], - ) - tm.assert_frame_equal(result, expected) - - def test_describe_categorical(self): - df = DataFrame({"value": np.random.randint(0, 10000, 100)}) - labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)] - cat_labels = Categorical(labels, labels) - - df = df.sort_values(by=["value"], ascending=True) - df["value_group"] = pd.cut( - df.value, range(0, 10500, 500), right=False, labels=cat_labels - ) - cat = df - - # Categoricals should not show up together with numerical columns - result = cat.describe() - assert len(result.columns) == 1 - - # In a frame, describe() for the cat should be the same as for string - # arrays (count, unique, top, freq) - - cat = Categorical( - ["a", "b", "b", "b"], categories=["a", "b", "c"], ordered=True - ) - s = Series(cat) - result = s.describe() - expected = Series([4, 2, "b", 3], index=["count", "unique", "top", "freq"]) - tm.assert_series_equal(result, expected) - - cat = Series(Categorical(["a", "b", "c", "c"])) - df3 = DataFrame({"cat": cat, "s": ["a", "b", "c", "c"]}) - result = df3.describe() - tm.assert_numpy_array_equal(result["cat"].values, result["s"].values) - - def test_describe_empty_categorical_column(self): - # GH#26397 - # Ensure the index of an an empty categorical DataFrame column - # also contains (count, unique, top, freq) - df = pd.DataFrame({"empty_col": Categorical([])}) - result = df.describe() - expected = DataFrame( - {"empty_col": [0, 0, np.nan, np.nan]}, - index=["count", "unique", "top", "freq"], - dtype="object", - ) - tm.assert_frame_equal(result, expected) - # ensure NaN, not None - assert np.isnan(result.iloc[2, 0]) - assert np.isnan(result.iloc[3, 0]) - - def test_describe_categorical_columns(self): - # GH#11558 - columns = pd.CategoricalIndex(["int1", "int2", "obj"], ordered=True, name="XXX") - df = DataFrame( - { - "int1": [10, 20, 30, 40, 50], - "int2": [10, 20, 30, 40, 50], - "obj": ["A", 0, None, "X", 1], - }, - columns=columns, - ) - result = df.describe() - - exp_columns = pd.CategoricalIndex( - ["int1", "int2"], - categories=["int1", "int2", "obj"], - ordered=True, - name="XXX", - ) - expected = DataFrame( - { - "int1": [5, 30, df.int1.std(), 10, 20, 30, 40, 50], - "int2": [5, 30, df.int2.std(), 10, 20, 30, 40, 50], - }, - index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], - columns=exp_columns, - ) - - tm.assert_frame_equal(result, expected) - tm.assert_categorical_equal(result.columns.values, expected.columns.values) - - def test_describe_datetime_columns(self): - columns = pd.DatetimeIndex( - ["2011-01-01", "2011-02-01", "2011-03-01"], - freq="MS", - tz="US/Eastern", - name="XXX", - ) - df = DataFrame( - { - 0: [10, 20, 30, 40, 50], - 1: [10, 20, 30, 40, 50], - 2: ["A", 0, None, "X", 1], - } - ) - df.columns = columns - result = df.describe() - - exp_columns = pd.DatetimeIndex( - ["2011-01-01", "2011-02-01"], freq="MS", tz="US/Eastern", name="XXX" - ) - expected = DataFrame( - { - 0: [5, 30, df.iloc[:, 0].std(), 10, 20, 30, 40, 50], - 1: [5, 30, df.iloc[:, 1].std(), 10, 20, 30, 40, 50], - }, - index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], - ) - expected.columns = exp_columns - tm.assert_frame_equal(result, expected) - assert result.columns.freq == "MS" - assert result.columns.tz == expected.columns.tz - - def test_describe_timedelta_values(self): - # GH#6145 - t1 = pd.timedelta_range("1 days", freq="D", periods=5) - t2 = pd.timedelta_range("1 hours", freq="H", periods=5) - df = pd.DataFrame({"t1": t1, "t2": t2}) - - expected = DataFrame( - { - "t1": [ - 5, - pd.Timedelta("3 days"), - df.iloc[:, 0].std(), - pd.Timedelta("1 days"), - pd.Timedelta("2 days"), - pd.Timedelta("3 days"), - pd.Timedelta("4 days"), - pd.Timedelta("5 days"), - ], - "t2": [ - 5, - pd.Timedelta("3 hours"), - df.iloc[:, 1].std(), - pd.Timedelta("1 hours"), - pd.Timedelta("2 hours"), - pd.Timedelta("3 hours"), - pd.Timedelta("4 hours"), - pd.Timedelta("5 hours"), - ], - }, - index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], - ) - - result = df.describe() - tm.assert_frame_equal(result, expected) - - exp_repr = ( - " t1 t2\n" - "count 5 5\n" - "mean 3 days 00:00:00 0 days 03:00:00\n" - "std 1 days 13:56:50.394919 0 days 01:34:52.099788\n" - "min 1 days 00:00:00 0 days 01:00:00\n" - "25% 2 days 00:00:00 0 days 02:00:00\n" - "50% 3 days 00:00:00 0 days 03:00:00\n" - "75% 4 days 00:00:00 0 days 04:00:00\n" - "max 5 days 00:00:00 0 days 05:00:00" - ) - assert repr(result) == exp_repr - - def test_describe_tz_values(self, tz_naive_fixture): - # GH#21332 - tz = tz_naive_fixture - s1 = Series(range(5)) - start = Timestamp(2018, 1, 1) - end = Timestamp(2018, 1, 5) - s2 = Series(date_range(start, end, tz=tz)) - df = pd.DataFrame({"s1": s1, "s2": s2}) - - expected = DataFrame( - { - "s1": [ - 5, - np.nan, - np.nan, - np.nan, - np.nan, - np.nan, - 2, - 1.581139, - 0, - 1, - 2, - 3, - 4, - ], - "s2": [ - 5, - 5, - s2.value_counts().index[0], - 1, - start.tz_localize(tz), - end.tz_localize(tz), - np.nan, - np.nan, - np.nan, - np.nan, - np.nan, - np.nan, - np.nan, - ], - }, - index=[ - "count", - "unique", - "top", - "freq", - "first", - "last", - "mean", - "std", - "min", - "25%", - "50%", - "75%", - "max", - ], - ) - result = df.describe(include="all") - tm.assert_frame_equal(result, expected) - - def test_describe_percentiles_integer_idx(self): - # GH#26660 - df = pd.DataFrame({"x": [1]}) - pct = np.linspace(0, 1, 10 + 1) - result = df.describe(percentiles=pct) - - expected = DataFrame( - {"x": [1.0, 1.0, np.NaN, 1.0, *[1.0 for _ in pct], 1.0]}, - index=[ - "count", - "mean", - "std", - "min", - "0%", - "10%", - "20%", - "30%", - "40%", - "50%", - "60%", - "70%", - "80%", - "90%", - "100%", - "max", - ], - ) - tm.assert_frame_equal(result, expected) - - class TestDataFrameAnalytics: # --------------------------------------------------------------------- @@ -1780,187 +1452,6 @@ def test_any_all_level_axis_none_raises(self, method): with pytest.raises(ValueError, match=xpr): getattr(df, method)(axis=None, level="out") - # ---------------------------------------------------------------------- - # Isin - - def test_isin(self): - # GH 4211 - df = DataFrame( - { - "vals": [1, 2, 3, 4], - "ids": ["a", "b", "f", "n"], - "ids2": ["a", "n", "c", "n"], - }, - index=["foo", "bar", "baz", "qux"], - ) - other = ["a", "b", "c"] - - result = df.isin(other) - expected = DataFrame([df.loc[s].isin(other) for s in df.index]) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("empty", [[], Series(dtype=object), np.array([])]) - def test_isin_empty(self, empty): - # GH 16991 - df = DataFrame({"A": ["a", "b", "c"], "B": ["a", "e", "f"]}) - expected = DataFrame(False, df.index, df.columns) - - result = df.isin(empty) - tm.assert_frame_equal(result, expected) - - def test_isin_dict(self): - df = DataFrame({"A": ["a", "b", "c"], "B": ["a", "e", "f"]}) - d = {"A": ["a"]} - - expected = DataFrame(False, df.index, df.columns) - expected.loc[0, "A"] = True - - result = df.isin(d) - tm.assert_frame_equal(result, expected) - - # non unique columns - df = DataFrame({"A": ["a", "b", "c"], "B": ["a", "e", "f"]}) - df.columns = ["A", "A"] - expected = DataFrame(False, df.index, df.columns) - expected.loc[0, "A"] = True - result = df.isin(d) - tm.assert_frame_equal(result, expected) - - def test_isin_with_string_scalar(self): - # GH 4763 - df = DataFrame( - { - "vals": [1, 2, 3, 4], - "ids": ["a", "b", "f", "n"], - "ids2": ["a", "n", "c", "n"], - }, - index=["foo", "bar", "baz", "qux"], - ) - with pytest.raises(TypeError): - df.isin("a") - - with pytest.raises(TypeError): - df.isin("aaa") - - def test_isin_df(self): - df1 = DataFrame({"A": [1, 2, 3, 4], "B": [2, np.nan, 4, 4]}) - df2 = DataFrame({"A": [0, 2, 12, 4], "B": [2, np.nan, 4, 5]}) - expected = DataFrame(False, df1.index, df1.columns) - result = df1.isin(df2) - expected["A"].loc[[1, 3]] = True - expected["B"].loc[[0, 2]] = True - tm.assert_frame_equal(result, expected) - - # partial overlapping columns - df2.columns = ["A", "C"] - result = df1.isin(df2) - expected["B"] = False - tm.assert_frame_equal(result, expected) - - def test_isin_tuples(self): - # GH 16394 - df = pd.DataFrame({"A": [1, 2, 3], "B": ["a", "b", "f"]}) - df["C"] = list(zip(df["A"], df["B"])) - result = df["C"].isin([(1, "a")]) - tm.assert_series_equal(result, Series([True, False, False], name="C")) - - def test_isin_df_dupe_values(self): - df1 = DataFrame({"A": [1, 2, 3, 4], "B": [2, np.nan, 4, 4]}) - # just cols duped - df2 = DataFrame([[0, 2], [12, 4], [2, np.nan], [4, 5]], columns=["B", "B"]) - with pytest.raises(ValueError): - df1.isin(df2) - - # just index duped - df2 = DataFrame( - [[0, 2], [12, 4], [2, np.nan], [4, 5]], - columns=["A", "B"], - index=[0, 0, 1, 1], - ) - with pytest.raises(ValueError): - df1.isin(df2) - - # cols and index: - df2.columns = ["B", "B"] - with pytest.raises(ValueError): - df1.isin(df2) - - def test_isin_dupe_self(self): - other = DataFrame({"A": [1, 0, 1, 0], "B": [1, 1, 0, 0]}) - df = DataFrame([[1, 1], [1, 0], [0, 0]], columns=["A", "A"]) - result = df.isin(other) - expected = DataFrame(False, index=df.index, columns=df.columns) - expected.loc[0] = True - expected.iloc[1, 1] = True - tm.assert_frame_equal(result, expected) - - def test_isin_against_series(self): - df = pd.DataFrame( - {"A": [1, 2, 3, 4], "B": [2, np.nan, 4, 4]}, index=["a", "b", "c", "d"] - ) - s = pd.Series([1, 3, 11, 4], index=["a", "b", "c", "d"]) - expected = DataFrame(False, index=df.index, columns=df.columns) - expected["A"].loc["a"] = True - expected.loc["d"] = True - result = df.isin(s) - tm.assert_frame_equal(result, expected) - - def test_isin_multiIndex(self): - idx = MultiIndex.from_tuples( - [ - (0, "a", "foo"), - (0, "a", "bar"), - (0, "b", "bar"), - (0, "b", "baz"), - (2, "a", "foo"), - (2, "a", "bar"), - (2, "c", "bar"), - (2, "c", "baz"), - (1, "b", "foo"), - (1, "b", "bar"), - (1, "c", "bar"), - (1, "c", "baz"), - ] - ) - df1 = DataFrame({"A": np.ones(12), "B": np.zeros(12)}, index=idx) - df2 = DataFrame( - { - "A": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], - "B": [1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1], - } - ) - # against regular index - expected = DataFrame(False, index=df1.index, columns=df1.columns) - result = df1.isin(df2) - tm.assert_frame_equal(result, expected) - - df2.index = idx - expected = df2.values.astype(np.bool) - expected[:, 1] = ~expected[:, 1] - expected = DataFrame(expected, columns=["A", "B"], index=idx) - - result = df1.isin(df2) - tm.assert_frame_equal(result, expected) - - def test_isin_empty_datetimelike(self): - # GH 15473 - df1_ts = DataFrame({"date": pd.to_datetime(["2014-01-01", "2014-01-02"])}) - df1_td = DataFrame({"date": [pd.Timedelta(1, "s"), pd.Timedelta(2, "s")]}) - df2 = DataFrame({"date": []}) - df3 = DataFrame() - - expected = DataFrame({"date": [False, False]}) - - result = df1_ts.isin(df2) - tm.assert_frame_equal(result, expected) - result = df1_ts.isin(df3) - tm.assert_frame_equal(result, expected) - - result = df1_td.isin(df2) - tm.assert_frame_equal(result, expected) - result = df1_td.isin(df3) - tm.assert_frame_equal(result, expected) - # --------------------------------------------------------------------- # Rounding @@ -2173,158 +1664,6 @@ def test_round_interval_category_columns(self): expected = DataFrame([[1.0, 1.0], [0.0, 0.0]], columns=columns) tm.assert_frame_equal(result, expected) - # --------------------------------------------------------------------- - # Clip - - def test_clip(self, float_frame): - median = float_frame.median().median() - original = float_frame.copy() - - double = float_frame.clip(upper=median, lower=median) - assert not (double.values != median).any() - - # Verify that float_frame was not changed inplace - assert (float_frame.values == original.values).all() - - def test_inplace_clip(self, float_frame): - # GH 15388 - median = float_frame.median().median() - frame_copy = float_frame.copy() - - frame_copy.clip(upper=median, lower=median, inplace=True) - assert not (frame_copy.values != median).any() - - def test_dataframe_clip(self): - # GH 2747 - df = DataFrame(np.random.randn(1000, 2)) - - for lb, ub in [(-1, 1), (1, -1)]: - clipped_df = df.clip(lb, ub) - - lb, ub = min(lb, ub), max(ub, lb) - lb_mask = df.values <= lb - ub_mask = df.values >= ub - mask = ~lb_mask & ~ub_mask - assert (clipped_df.values[lb_mask] == lb).all() - assert (clipped_df.values[ub_mask] == ub).all() - assert (clipped_df.values[mask] == df.values[mask]).all() - - def test_clip_mixed_numeric(self): - # TODO(jreback) - # clip on mixed integer or floats - # with integer clippers coerces to float - df = DataFrame({"A": [1, 2, 3], "B": [1.0, np.nan, 3.0]}) - result = df.clip(1, 2) - expected = DataFrame({"A": [1, 2, 2], "B": [1.0, np.nan, 2.0]}) - tm.assert_frame_equal(result, expected, check_like=True) - - # GH 24162, clipping now preserves numeric types per column - df = DataFrame([[1, 2, 3.4], [3, 4, 5.6]], columns=["foo", "bar", "baz"]) - expected = df.dtypes - result = df.clip(upper=3).dtypes - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize("inplace", [True, False]) - def test_clip_against_series(self, inplace): - # GH 6966 - - df = DataFrame(np.random.randn(1000, 2)) - lb = Series(np.random.randn(1000)) - ub = lb + 1 - - original = df.copy() - clipped_df = df.clip(lb, ub, axis=0, inplace=inplace) - - if inplace: - clipped_df = df - - for i in range(2): - lb_mask = original.iloc[:, i] <= lb - ub_mask = original.iloc[:, i] >= ub - mask = ~lb_mask & ~ub_mask - - result = clipped_df.loc[lb_mask, i] - tm.assert_series_equal(result, lb[lb_mask], check_names=False) - assert result.name == i - - result = clipped_df.loc[ub_mask, i] - tm.assert_series_equal(result, ub[ub_mask], check_names=False) - assert result.name == i - - tm.assert_series_equal(clipped_df.loc[mask, i], df.loc[mask, i]) - - @pytest.mark.parametrize("inplace", [True, False]) - @pytest.mark.parametrize("lower", [[2, 3, 4], np.asarray([2, 3, 4])]) - @pytest.mark.parametrize( - "axis,res", - [ - (0, [[2.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 7.0, 7.0]]), - (1, [[2.0, 3.0, 4.0], [4.0, 5.0, 6.0], [5.0, 6.0, 7.0]]), - ], - ) - def test_clip_against_list_like(self, simple_frame, inplace, lower, axis, res): - # GH 15390 - original = simple_frame.copy(deep=True) - - result = original.clip(lower=lower, upper=[5, 6, 7], axis=axis, inplace=inplace) - - expected = pd.DataFrame(res, columns=original.columns, index=original.index) - if inplace: - result = original - tm.assert_frame_equal(result, expected, check_exact=True) - - @pytest.mark.parametrize("axis", [0, 1, None]) - def test_clip_against_frame(self, axis): - df = DataFrame(np.random.randn(1000, 2)) - lb = DataFrame(np.random.randn(1000, 2)) - ub = lb + 1 - - clipped_df = df.clip(lb, ub, axis=axis) - - lb_mask = df <= lb - ub_mask = df >= ub - mask = ~lb_mask & ~ub_mask - - tm.assert_frame_equal(clipped_df[lb_mask], lb[lb_mask]) - tm.assert_frame_equal(clipped_df[ub_mask], ub[ub_mask]) - tm.assert_frame_equal(clipped_df[mask], df[mask]) - - def test_clip_against_unordered_columns(self): - # GH 20911 - df1 = DataFrame(np.random.randn(1000, 4), columns=["A", "B", "C", "D"]) - df2 = DataFrame(np.random.randn(1000, 4), columns=["D", "A", "B", "C"]) - df3 = DataFrame(df2.values - 1, columns=["B", "D", "C", "A"]) - result_upper = df1.clip(lower=0, upper=df2) - expected_upper = df1.clip(lower=0, upper=df2[df1.columns]) - result_lower = df1.clip(lower=df3, upper=3) - expected_lower = df1.clip(lower=df3[df1.columns], upper=3) - result_lower_upper = df1.clip(lower=df3, upper=df2) - expected_lower_upper = df1.clip(lower=df3[df1.columns], upper=df2[df1.columns]) - tm.assert_frame_equal(result_upper, expected_upper) - tm.assert_frame_equal(result_lower, expected_lower) - tm.assert_frame_equal(result_lower_upper, expected_lower_upper) - - def test_clip_with_na_args(self, float_frame): - """Should process np.nan argument as None """ - # GH 17276 - tm.assert_frame_equal(float_frame.clip(np.nan), float_frame) - tm.assert_frame_equal(float_frame.clip(upper=np.nan, lower=np.nan), float_frame) - - # GH 19992 - df = DataFrame({"col_0": [1, 2, 3], "col_1": [4, 5, 6], "col_2": [7, 8, 9]}) - - result = df.clip(lower=[4, 5, np.nan], axis=0) - expected = DataFrame( - {"col_0": [4, 5, np.nan], "col_1": [4, 5, np.nan], "col_2": [7, 8, np.nan]} - ) - tm.assert_frame_equal(result, expected) - - result = df.clip(lower=[4, 5, np.nan], axis=1) - expected = DataFrame( - {"col_0": [4, 4, 4], "col_1": [5, 5, 6], "col_2": [np.nan, np.nan, np.nan]} - ) - tm.assert_frame_equal(result, expected) - # --------------------------------------------------------------------- # Matrix-like From 9bf172be9a19161bfd63656a26df15e6595b87f6 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 6 Feb 2020 17:37:16 -0800 Subject: [PATCH 6/7] CLN: generic tests --- pandas/tests/frame/test_block_internals.py | 20 +++--- pandas/tests/generic/test_frame.py | 76 +++++++++++++++++++--- pandas/tests/generic/test_generic.py | 35 ---------- pandas/tests/generic/test_series.py | 62 ++++++++++++------ 4 files changed, 118 insertions(+), 75 deletions(-) diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index d301ed969789e..a5f5e6f36cd58 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -364,14 +364,14 @@ def test_pickle(self, float_string_frame, timezone_frame): def test_consolidate_datetime64(self): # numpy vstack bug - data = """\ -starting,ending,measure -2012-06-21 00:00,2012-06-23 07:00,77 -2012-06-23 07:00,2012-06-23 16:30,65 -2012-06-23 16:30,2012-06-25 08:00,77 -2012-06-25 08:00,2012-06-26 12:00,0 -2012-06-26 12:00,2012-06-27 08:00,77 -""" + data = ( + "starting,ending,measure\n" + "2012-06-21 00:00,2012-06-23 07:00,77\n" + "2012-06-23 07:00,2012-06-23 16:30,65\n" + "2012-06-23 16:30,2012-06-25 08:00,77\n" + "2012-06-25 08:00,2012-06-26 12:00,0\n" + "2012-06-26 12:00,2012-06-27 08:00,77\n" + ) df = pd.read_csv(StringIO(data), parse_dates=[0, 1]) ser_starting = df.starting @@ -397,9 +397,6 @@ def test_is_mixed_type(self, float_frame, float_string_frame): assert float_string_frame._is_mixed_type def test_get_numeric_data(self): - # TODO(wesm): unused? - intname = np.dtype(np.int_).name # noqa - floatname = np.dtype(np.float_).name # noqa datetime64name = np.dtype("M8[ns]").name objectname = np.dtype(np.object_).name @@ -581,6 +578,7 @@ def test_get_X_columns(self): tm.assert_index_equal(df._get_numeric_data().columns, pd.Index(["a", "b", "e"])) def test_strange_column_corruption_issue(self): + # FIXME: dont leave commented-out # (wesm) Unclear how exactly this is related to internal matters df = DataFrame(index=[0, 1]) df[0] = np.nan diff --git a/pandas/tests/generic/test_frame.py b/pandas/tests/generic/test_frame.py index 7fe22e77c5bf3..72e13db03f190 100644 --- a/pandas/tests/generic/test_frame.py +++ b/pandas/tests/generic/test_frame.py @@ -160,7 +160,7 @@ def finalize(self, other, method=None, **kwargs): # reset DataFrame._metadata = _metadata - DataFrame.__finalize__ = _finalize + DataFrame.__finalize__ = _finalize # FIXME: use monkeypatch def test_set_attribute(self): # Test for consistent setattr behavior when an attribute and a column @@ -174,6 +174,72 @@ def test_set_attribute(self): assert df.y == 5 tm.assert_series_equal(df["y"], Series([2, 4, 6], name="y")) + def test_deepcopy_empty(self): + # This test covers empty frame copying with non-empty column sets + # as reported in issue GH15370 + empty_frame = DataFrame(data=[], index=[], columns=["A"]) + empty_frame_copy = deepcopy(empty_frame) + + self._compare(empty_frame_copy, empty_frame) + + +# formerly in Generic but only test DataFrame +class TestDataFrame2: + def test_validate_bool_args(self): + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + invalid_values = [1, "True", [1, 2, 3], 5.0] + + for value in invalid_values: + with pytest.raises(ValueError): + super(DataFrame, df).rename_axis( + mapper={"a": "x", "b": "y"}, axis=1, inplace=value + ) + + with pytest.raises(ValueError): + super(DataFrame, df).drop("a", axis=1, inplace=value) + + with pytest.raises(ValueError): + super(DataFrame, df).sort_index(inplace=value) + + with pytest.raises(ValueError): + super(DataFrame, df)._consolidate(inplace=value) + + with pytest.raises(ValueError): + super(DataFrame, df).fillna(value=0, inplace=value) + + with pytest.raises(ValueError): + super(DataFrame, df).replace(to_replace=1, value=7, inplace=value) + + with pytest.raises(ValueError): + super(DataFrame, df).interpolate(inplace=value) + + with pytest.raises(ValueError): + super(DataFrame, df)._where(cond=df.a > 2, inplace=value) + + with pytest.raises(ValueError): + super(DataFrame, df).mask(cond=df.a > 2, inplace=value) + + def test_unexpected_keyword(self): + # GH8597 + df = DataFrame(np.random.randn(5, 2), columns=["jim", "joe"]) + ca = pd.Categorical([0, 0, 2, 2, 3, np.nan]) + ts = df["joe"].copy() + ts[2] = np.nan + + with pytest.raises(TypeError, match="unexpected keyword"): + df.drop("joe", axis=1, in_place=True) + + with pytest.raises(TypeError, match="unexpected keyword"): + df.reindex([1, 0], inplace=True) + + with pytest.raises(TypeError, match="unexpected keyword"): + ca.fillna(0, inplace=True) + + with pytest.raises(TypeError, match="unexpected keyword"): + ts.fillna(0, in_place=True) + + +class TestToXArray: @pytest.mark.skipif( not _XARRAY_INSTALLED or _XARRAY_INSTALLED @@ -272,11 +338,3 @@ def test_to_xarray(self): expected["f"] = expected["f"].astype(object) expected.columns.name = None tm.assert_frame_equal(result, expected, check_index_type=False) - - def test_deepcopy_empty(self): - # This test covers empty frame copying with non-empty column sets - # as reported in issue GH15370 - empty_frame = DataFrame(data=[], index=[], columns=["A"]) - empty_frame_copy = deepcopy(empty_frame) - - self._compare(empty_frame_copy, empty_frame) diff --git a/pandas/tests/generic/test_generic.py b/pandas/tests/generic/test_generic.py index efb04c7f63c66..4533afb05179b 100644 --- a/pandas/tests/generic/test_generic.py +++ b/pandas/tests/generic/test_generic.py @@ -103,23 +103,6 @@ def test_get_numeric_data(self): # _get_numeric_data is includes _get_bool_data, so can't test for # non-inclusion - def test_get_default(self): - - # GH 7725 - d0 = "a", "b", "c", "d" - d1 = np.arange(4, dtype="int64") - others = "e", 10 - - for data, index in ((d0, d1), (d1, d0)): - s = Series(data, index=index) - for i, d in zip(index, data): - assert s.get(i) == d - assert s.get(i, d) == d - assert s.get(i, "z") == d - for other in others: - assert s.get(other, "z") == "z" - assert s.get(other, other) == other - def test_nonzero(self): # GH 4633 @@ -460,24 +443,6 @@ def test_split_compat(self): assert len(np.array_split(o, 5)) == 5 assert len(np.array_split(o, 2)) == 2 - def test_unexpected_keyword(self): # GH8597 - df = DataFrame(np.random.randn(5, 2), columns=["jim", "joe"]) - ca = pd.Categorical([0, 0, 2, 2, 3, np.nan]) - ts = df["joe"].copy() - ts[2] = np.nan - - with pytest.raises(TypeError, match="unexpected keyword"): - df.drop("joe", axis=1, in_place=True) - - with pytest.raises(TypeError, match="unexpected keyword"): - df.reindex([1, 0], inplace=True) - - with pytest.raises(TypeError, match="unexpected keyword"): - ca.fillna(0, inplace=True) - - with pytest.raises(TypeError, match="unexpected keyword"): - ts.fillna(0, in_place=True) - # See gh-12301 def test_stat_unexpected_keyword(self): obj = self._construct(5) diff --git a/pandas/tests/generic/test_series.py b/pandas/tests/generic/test_series.py index 8ad8355f2d530..ce0daf8522687 100644 --- a/pandas/tests/generic/test_series.py +++ b/pandas/tests/generic/test_series.py @@ -181,8 +181,49 @@ def finalize(self, other, method=None, **kwargs): # reset Series._metadata = _metadata - Series.__finalize__ = _finalize + Series.__finalize__ = _finalize # FIXME: use monkeypatch + @pytest.mark.parametrize( + "s", + [ + Series([np.arange(5)]), + pd.date_range("1/1/2011", periods=24, freq="H"), + pd.Series(range(5), index=pd.date_range("2017", periods=5)), + ], + ) + @pytest.mark.parametrize("shift_size", [0, 1, 2]) + def test_shift_always_copy(self, s, shift_size): + # GH22397 + assert s.shift(shift_size) is not s + + @pytest.mark.parametrize("move_by_freq", [pd.Timedelta("1D"), pd.Timedelta("1M")]) + def test_datetime_shift_always_copy(self, move_by_freq): + # GH22397 + s = pd.Series(range(5), index=pd.date_range("2017", periods=5)) + assert s.shift(freq=move_by_freq) is not s + + +class TestSeries2: + # moved from Generic + def test_get_default(self): + + # GH#7725 + d0 = ["a", "b", "c", "d"] + d1 = np.arange(4, dtype="int64") + others = ["e", 10] + + for data, index in ((d0, d1), (d1, d0)): + s = Series(data, index=index) + for i, d in zip(index, data): + assert s.get(i) == d + assert s.get(i, d) == d + assert s.get(i, "z") == d + for other in others: + assert s.get(other, "z") == "z" + assert s.get(other, other) == other + + +class TestToXArray: @pytest.mark.skipif( not _XARRAY_INSTALLED or _XARRAY_INSTALLED @@ -242,22 +283,3 @@ def test_to_xarray(self): tm.assert_almost_equal(list(result.coords.keys()), ["one", "two"]) assert isinstance(result, DataArray) tm.assert_series_equal(result.to_series(), s) - - @pytest.mark.parametrize( - "s", - [ - Series([np.arange(5)]), - pd.date_range("1/1/2011", periods=24, freq="H"), - pd.Series(range(5), index=pd.date_range("2017", periods=5)), - ], - ) - @pytest.mark.parametrize("shift_size", [0, 1, 2]) - def test_shift_always_copy(self, s, shift_size): - # GH22397 - assert s.shift(shift_size) is not s - - @pytest.mark.parametrize("move_by_freq", [pd.Timedelta("1D"), pd.Timedelta("1M")]) - def test_datetime_shift_always_copy(self, move_by_freq): - # GH22397 - s = pd.Series(range(5), index=pd.date_range("2017", periods=5)) - assert s.shift(freq=move_by_freq) is not s From 6ffad92e04c7a11983e3049e6633fc7399a7d5a6 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 10 Feb 2020 18:16:04 -0800 Subject: [PATCH 7/7] merge fixup --- pandas/tests/generic/test_frame.py | 3 --- pandas/tests/generic/test_generic.py | 31 ---------------------------- 2 files changed, 34 deletions(-) diff --git a/pandas/tests/generic/test_frame.py b/pandas/tests/generic/test_frame.py index 72e13db03f190..d8f4257566f84 100644 --- a/pandas/tests/generic/test_frame.py +++ b/pandas/tests/generic/test_frame.py @@ -198,9 +198,6 @@ def test_validate_bool_args(self): with pytest.raises(ValueError): super(DataFrame, df).drop("a", axis=1, inplace=value) - with pytest.raises(ValueError): - super(DataFrame, df).sort_index(inplace=value) - with pytest.raises(ValueError): super(DataFrame, df)._consolidate(inplace=value) diff --git a/pandas/tests/generic/test_generic.py b/pandas/tests/generic/test_generic.py index 5754f39be9c9e..d574660d21c0d 100644 --- a/pandas/tests/generic/test_generic.py +++ b/pandas/tests/generic/test_generic.py @@ -509,37 +509,6 @@ def test_truncate_out_of_bounds(self): self._compare(big.truncate(before=0, after=3e6), big) self._compare(big.truncate(before=-1, after=2e6), big) - def test_validate_bool_args(self): - df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - invalid_values = [1, "True", [1, 2, 3], 5.0] - - for value in invalid_values: - with pytest.raises(ValueError): - super(DataFrame, df).rename_axis( - mapper={"a": "x", "b": "y"}, axis=1, inplace=value - ) - - with pytest.raises(ValueError): - super(DataFrame, df).drop("a", axis=1, inplace=value) - - with pytest.raises(ValueError): - super(DataFrame, df)._consolidate(inplace=value) - - with pytest.raises(ValueError): - super(DataFrame, df).fillna(value=0, inplace=value) - - with pytest.raises(ValueError): - super(DataFrame, df).replace(to_replace=1, value=7, inplace=value) - - with pytest.raises(ValueError): - super(DataFrame, df).interpolate(inplace=value) - - with pytest.raises(ValueError): - super(DataFrame, df)._where(cond=df.a > 2, inplace=value) - - with pytest.raises(ValueError): - super(DataFrame, df).mask(cond=df.a > 2, inplace=value) - def test_copy_and_deepcopy(self): # GH 15444 for shape in [0, 1, 2]: