diff --git a/pandas/tests/frame/methods/test_clip.py b/pandas/tests/frame/methods/test_clip.py new file mode 100644 index 0000000000000..48444e909ee01 --- /dev/null +++ b/pandas/tests/frame/methods/test_clip.py @@ -0,0 +1,157 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Series +import pandas.util.testing as tm + + +class TestDataFrameClip: + def test_clip(self, float_frame): + median = float_frame.median().median() + original = float_frame.copy() + + double = float_frame.clip(upper=median, lower=median) + assert not (double.values != median).any() + + # Verify that float_frame was not changed inplace + assert (float_frame.values == original.values).all() + + def test_inplace_clip(self, float_frame): + # GH#15388 + median = float_frame.median().median() + frame_copy = float_frame.copy() + + frame_copy.clip(upper=median, lower=median, inplace=True) + assert not (frame_copy.values != median).any() + + def test_dataframe_clip(self): + # GH#2747 + df = DataFrame(np.random.randn(1000, 2)) + + for lb, ub in [(-1, 1), (1, -1)]: + clipped_df = df.clip(lb, ub) + + lb, ub = min(lb, ub), max(ub, lb) + lb_mask = df.values <= lb + ub_mask = df.values >= ub + mask = ~lb_mask & ~ub_mask + assert (clipped_df.values[lb_mask] == lb).all() + assert (clipped_df.values[ub_mask] == ub).all() + assert (clipped_df.values[mask] == df.values[mask]).all() + + def test_clip_mixed_numeric(self): + # TODO(jreback) + # clip on mixed integer or floats + # with integer clippers coerces to float + df = DataFrame({"A": [1, 2, 3], "B": [1.0, np.nan, 3.0]}) + result = df.clip(1, 2) + expected = DataFrame({"A": [1, 2, 2], "B": [1.0, np.nan, 2.0]}) + tm.assert_frame_equal(result, expected, check_like=True) + + # GH#24162, clipping now preserves numeric types per column + df = DataFrame([[1, 2, 3.4], [3, 4, 5.6]], columns=["foo", "bar", "baz"]) + expected = df.dtypes + result = df.clip(upper=3).dtypes + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("inplace", [True, False]) + def test_clip_against_series(self, inplace): + # GH#6966 + + df = DataFrame(np.random.randn(1000, 2)) + lb = Series(np.random.randn(1000)) + ub = lb + 1 + + original = df.copy() + clipped_df = df.clip(lb, ub, axis=0, inplace=inplace) + + if inplace: + clipped_df = df + + for i in range(2): + lb_mask = original.iloc[:, i] <= lb + ub_mask = original.iloc[:, i] >= ub + mask = ~lb_mask & ~ub_mask + + result = clipped_df.loc[lb_mask, i] + tm.assert_series_equal(result, lb[lb_mask], check_names=False) + assert result.name == i + + result = clipped_df.loc[ub_mask, i] + tm.assert_series_equal(result, ub[ub_mask], check_names=False) + assert result.name == i + + tm.assert_series_equal(clipped_df.loc[mask, i], df.loc[mask, i]) + + @pytest.mark.parametrize("inplace", [True, False]) + @pytest.mark.parametrize("lower", [[2, 3, 4], np.asarray([2, 3, 4])]) + @pytest.mark.parametrize( + "axis,res", + [ + (0, [[2.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 7.0, 7.0]]), + (1, [[2.0, 3.0, 4.0], [4.0, 5.0, 6.0], [5.0, 6.0, 7.0]]), + ], + ) + def test_clip_against_list_like(self, simple_frame, inplace, lower, axis, res): + # GH#15390 + original = simple_frame.copy(deep=True) + + result = original.clip(lower=lower, upper=[5, 6, 7], axis=axis, inplace=inplace) + + expected = pd.DataFrame(res, columns=original.columns, index=original.index) + if inplace: + result = original + tm.assert_frame_equal(result, expected, check_exact=True) + + @pytest.mark.parametrize("axis", [0, 1, None]) + def test_clip_against_frame(self, axis): + df = DataFrame(np.random.randn(1000, 2)) + lb = DataFrame(np.random.randn(1000, 2)) + ub = lb + 1 + + clipped_df = df.clip(lb, ub, axis=axis) + + lb_mask = df <= lb + ub_mask = df >= ub + mask = ~lb_mask & ~ub_mask + + tm.assert_frame_equal(clipped_df[lb_mask], lb[lb_mask]) + tm.assert_frame_equal(clipped_df[ub_mask], ub[ub_mask]) + tm.assert_frame_equal(clipped_df[mask], df[mask]) + + def test_clip_against_unordered_columns(self): + # GH#20911 + df1 = DataFrame(np.random.randn(1000, 4), columns=["A", "B", "C", "D"]) + df2 = DataFrame(np.random.randn(1000, 4), columns=["D", "A", "B", "C"]) + df3 = DataFrame(df2.values - 1, columns=["B", "D", "C", "A"]) + result_upper = df1.clip(lower=0, upper=df2) + expected_upper = df1.clip(lower=0, upper=df2[df1.columns]) + result_lower = df1.clip(lower=df3, upper=3) + expected_lower = df1.clip(lower=df3[df1.columns], upper=3) + result_lower_upper = df1.clip(lower=df3, upper=df2) + expected_lower_upper = df1.clip(lower=df3[df1.columns], upper=df2[df1.columns]) + tm.assert_frame_equal(result_upper, expected_upper) + tm.assert_frame_equal(result_lower, expected_lower) + tm.assert_frame_equal(result_lower_upper, expected_lower_upper) + + def test_clip_with_na_args(self, float_frame): + """Should process np.nan argument as None """ + # GH#17276 + tm.assert_frame_equal(float_frame.clip(np.nan), float_frame) + tm.assert_frame_equal(float_frame.clip(upper=np.nan, lower=np.nan), float_frame) + + # GH#19992 + df = DataFrame({"col_0": [1, 2, 3], "col_1": [4, 5, 6], "col_2": [7, 8, 9]}) + + result = df.clip(lower=[4, 5, np.nan], axis=0) + expected = DataFrame( + {"col_0": [4, 5, np.nan], "col_1": [4, 5, np.nan], "col_2": [7, 8, np.nan]} + ) + tm.assert_frame_equal(result, expected) + + result = df.clip(lower=[4, 5, np.nan], axis=1) + expected = DataFrame( + {"col_0": [4, 4, 4], "col_1": [5, 5, 6], "col_2": [np.nan, np.nan, np.nan]} + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_describe.py b/pandas/tests/frame/methods/test_describe.py new file mode 100644 index 0000000000000..09510fc931546 --- /dev/null +++ b/pandas/tests/frame/methods/test_describe.py @@ -0,0 +1,333 @@ +import numpy as np + +import pandas as pd +from pandas import Categorical, DataFrame, Series, Timestamp, date_range +import pandas.util.testing as tm + + +class TestDataFrameDescribe: + def test_describe_bool_in_mixed_frame(self): + df = DataFrame( + { + "string_data": ["a", "b", "c", "d", "e"], + "bool_data": [True, True, False, False, False], + "int_data": [10, 20, 30, 40, 50], + } + ) + + # Integer data are included in .describe() output, + # Boolean and string data are not. + result = df.describe() + expected = DataFrame( + {"int_data": [5, 30, df.int_data.std(), 10, 20, 30, 40, 50]}, + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + ) + tm.assert_frame_equal(result, expected) + + # Top value is a boolean value that is False + result = df.describe(include=["bool"]) + + expected = DataFrame( + {"bool_data": [5, 2, False, 3]}, index=["count", "unique", "top", "freq"] + ) + tm.assert_frame_equal(result, expected) + + def test_describe_empty_object(self): + # GH#27183 + df = pd.DataFrame({"A": [None, None]}, dtype=object) + result = df.describe() + expected = pd.DataFrame( + {"A": [0, 0, np.nan, np.nan]}, + dtype=object, + index=["count", "unique", "top", "freq"], + ) + tm.assert_frame_equal(result, expected) + + result = df.iloc[:0].describe() + tm.assert_frame_equal(result, expected) + + def test_describe_bool_frame(self): + # GH#13891 + df = pd.DataFrame( + { + "bool_data_1": [False, False, True, True], + "bool_data_2": [False, True, True, True], + } + ) + result = df.describe() + expected = DataFrame( + {"bool_data_1": [4, 2, True, 2], "bool_data_2": [4, 2, True, 3]}, + index=["count", "unique", "top", "freq"], + ) + tm.assert_frame_equal(result, expected) + + df = pd.DataFrame( + { + "bool_data": [False, False, True, True, False], + "int_data": [0, 1, 2, 3, 4], + } + ) + result = df.describe() + expected = DataFrame( + {"int_data": [5, 2, df.int_data.std(), 0, 1, 2, 3, 4]}, + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + ) + tm.assert_frame_equal(result, expected) + + df = pd.DataFrame( + {"bool_data": [False, False, True, True], "str_data": ["a", "b", "c", "a"]} + ) + result = df.describe() + expected = DataFrame( + {"bool_data": [4, 2, True, 2], "str_data": [4, 3, "a", 2]}, + index=["count", "unique", "top", "freq"], + ) + tm.assert_frame_equal(result, expected) + + def test_describe_categorical(self): + df = DataFrame({"value": np.random.randint(0, 10000, 100)}) + labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)] + cat_labels = Categorical(labels, labels) + + df = df.sort_values(by=["value"], ascending=True) + df["value_group"] = pd.cut( + df.value, range(0, 10500, 500), right=False, labels=cat_labels + ) + cat = df + + # Categoricals should not show up together with numerical columns + result = cat.describe() + assert len(result.columns) == 1 + + # In a frame, describe() for the cat should be the same as for string + # arrays (count, unique, top, freq) + + cat = Categorical( + ["a", "b", "b", "b"], categories=["a", "b", "c"], ordered=True + ) + s = Series(cat) + result = s.describe() + expected = Series([4, 2, "b", 3], index=["count", "unique", "top", "freq"]) + tm.assert_series_equal(result, expected) + + cat = Series(Categorical(["a", "b", "c", "c"])) + df3 = DataFrame({"cat": cat, "s": ["a", "b", "c", "c"]}) + result = df3.describe() + tm.assert_numpy_array_equal(result["cat"].values, result["s"].values) + + def test_describe_empty_categorical_column(self): + # GH#26397 + # Ensure the index of an an empty categorical DataFrame column + # also contains (count, unique, top, freq) + df = pd.DataFrame({"empty_col": Categorical([])}) + result = df.describe() + expected = DataFrame( + {"empty_col": [0, 0, np.nan, np.nan]}, + index=["count", "unique", "top", "freq"], + dtype="object", + ) + tm.assert_frame_equal(result, expected) + # ensure NaN, not None + assert np.isnan(result.iloc[2, 0]) + assert np.isnan(result.iloc[3, 0]) + + def test_describe_categorical_columns(self): + # GH#11558 + columns = pd.CategoricalIndex(["int1", "int2", "obj"], ordered=True, name="XXX") + df = DataFrame( + { + "int1": [10, 20, 30, 40, 50], + "int2": [10, 20, 30, 40, 50], + "obj": ["A", 0, None, "X", 1], + }, + columns=columns, + ) + result = df.describe() + + exp_columns = pd.CategoricalIndex( + ["int1", "int2"], + categories=["int1", "int2", "obj"], + ordered=True, + name="XXX", + ) + expected = DataFrame( + { + "int1": [5, 30, df.int1.std(), 10, 20, 30, 40, 50], + "int2": [5, 30, df.int2.std(), 10, 20, 30, 40, 50], + }, + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + columns=exp_columns, + ) + + tm.assert_frame_equal(result, expected) + tm.assert_categorical_equal(result.columns.values, expected.columns.values) + + def test_describe_datetime_columns(self): + columns = pd.DatetimeIndex( + ["2011-01-01", "2011-02-01", "2011-03-01"], + freq="MS", + tz="US/Eastern", + name="XXX", + ) + df = DataFrame( + { + 0: [10, 20, 30, 40, 50], + 1: [10, 20, 30, 40, 50], + 2: ["A", 0, None, "X", 1], + } + ) + df.columns = columns + result = df.describe() + + exp_columns = pd.DatetimeIndex( + ["2011-01-01", "2011-02-01"], freq="MS", tz="US/Eastern", name="XXX" + ) + expected = DataFrame( + { + 0: [5, 30, df.iloc[:, 0].std(), 10, 20, 30, 40, 50], + 1: [5, 30, df.iloc[:, 1].std(), 10, 20, 30, 40, 50], + }, + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + ) + expected.columns = exp_columns + tm.assert_frame_equal(result, expected) + assert result.columns.freq == "MS" + assert result.columns.tz == expected.columns.tz + + def test_describe_timedelta_values(self): + # GH#6145 + t1 = pd.timedelta_range("1 days", freq="D", periods=5) + t2 = pd.timedelta_range("1 hours", freq="H", periods=5) + df = pd.DataFrame({"t1": t1, "t2": t2}) + + expected = DataFrame( + { + "t1": [ + 5, + pd.Timedelta("3 days"), + df.iloc[:, 0].std(), + pd.Timedelta("1 days"), + pd.Timedelta("2 days"), + pd.Timedelta("3 days"), + pd.Timedelta("4 days"), + pd.Timedelta("5 days"), + ], + "t2": [ + 5, + pd.Timedelta("3 hours"), + df.iloc[:, 1].std(), + pd.Timedelta("1 hours"), + pd.Timedelta("2 hours"), + pd.Timedelta("3 hours"), + pd.Timedelta("4 hours"), + pd.Timedelta("5 hours"), + ], + }, + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + ) + + result = df.describe() + tm.assert_frame_equal(result, expected) + + exp_repr = ( + " t1 t2\n" + "count 5 5\n" + "mean 3 days 00:00:00 0 days 03:00:00\n" + "std 1 days 13:56:50.394919 0 days 01:34:52.099788\n" + "min 1 days 00:00:00 0 days 01:00:00\n" + "25% 2 days 00:00:00 0 days 02:00:00\n" + "50% 3 days 00:00:00 0 days 03:00:00\n" + "75% 4 days 00:00:00 0 days 04:00:00\n" + "max 5 days 00:00:00 0 days 05:00:00" + ) + assert repr(result) == exp_repr + + def test_describe_tz_values(self, tz_naive_fixture): + # GH#21332 + tz = tz_naive_fixture + s1 = Series(range(5)) + start = Timestamp(2018, 1, 1) + end = Timestamp(2018, 1, 5) + s2 = Series(date_range(start, end, tz=tz)) + df = pd.DataFrame({"s1": s1, "s2": s2}) + + expected = DataFrame( + { + "s1": [ + 5, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + 2, + 1.581139, + 0, + 1, + 2, + 3, + 4, + ], + "s2": [ + 5, + 5, + s2.value_counts().index[0], + 1, + start.tz_localize(tz), + end.tz_localize(tz), + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + ], + }, + index=[ + "count", + "unique", + "top", + "freq", + "first", + "last", + "mean", + "std", + "min", + "25%", + "50%", + "75%", + "max", + ], + ) + result = df.describe(include="all") + tm.assert_frame_equal(result, expected) + + def test_describe_percentiles_integer_idx(self): + # GH#26660 + df = pd.DataFrame({"x": [1]}) + pct = np.linspace(0, 1, 10 + 1) + result = df.describe(percentiles=pct) + + expected = DataFrame( + {"x": [1.0, 1.0, np.NaN, 1.0, *[1.0 for _ in pct], 1.0]}, + index=[ + "count", + "mean", + "std", + "min", + "0%", + "10%", + "20%", + "30%", + "40%", + "50%", + "60%", + "70%", + "80%", + "90%", + "100%", + "max", + ], + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_isin.py b/pandas/tests/frame/methods/test_isin.py new file mode 100644 index 0000000000000..5d7dc5c843ec1 --- /dev/null +++ b/pandas/tests/frame/methods/test_isin.py @@ -0,0 +1,186 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, MultiIndex, Series +import pandas.util.testing as tm + + +class TestDataFrameIsIn: + def test_isin(self): + # GH#4211 + df = DataFrame( + { + "vals": [1, 2, 3, 4], + "ids": ["a", "b", "f", "n"], + "ids2": ["a", "n", "c", "n"], + }, + index=["foo", "bar", "baz", "qux"], + ) + other = ["a", "b", "c"] + + result = df.isin(other) + expected = DataFrame([df.loc[s].isin(other) for s in df.index]) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("empty", [[], Series(dtype=object), np.array([])]) + def test_isin_empty(self, empty): + # GH#16991 + df = DataFrame({"A": ["a", "b", "c"], "B": ["a", "e", "f"]}) + expected = DataFrame(False, df.index, df.columns) + + result = df.isin(empty) + tm.assert_frame_equal(result, expected) + + def test_isin_dict(self): + df = DataFrame({"A": ["a", "b", "c"], "B": ["a", "e", "f"]}) + d = {"A": ["a"]} + + expected = DataFrame(False, df.index, df.columns) + expected.loc[0, "A"] = True + + result = df.isin(d) + tm.assert_frame_equal(result, expected) + + # non unique columns + df = DataFrame({"A": ["a", "b", "c"], "B": ["a", "e", "f"]}) + df.columns = ["A", "A"] + expected = DataFrame(False, df.index, df.columns) + expected.loc[0, "A"] = True + result = df.isin(d) + tm.assert_frame_equal(result, expected) + + def test_isin_with_string_scalar(self): + # GH#4763 + df = DataFrame( + { + "vals": [1, 2, 3, 4], + "ids": ["a", "b", "f", "n"], + "ids2": ["a", "n", "c", "n"], + }, + index=["foo", "bar", "baz", "qux"], + ) + with pytest.raises(TypeError): + df.isin("a") + + with pytest.raises(TypeError): + df.isin("aaa") + + def test_isin_df(self): + df1 = DataFrame({"A": [1, 2, 3, 4], "B": [2, np.nan, 4, 4]}) + df2 = DataFrame({"A": [0, 2, 12, 4], "B": [2, np.nan, 4, 5]}) + expected = DataFrame(False, df1.index, df1.columns) + result = df1.isin(df2) + expected["A"].loc[[1, 3]] = True + expected["B"].loc[[0, 2]] = True + tm.assert_frame_equal(result, expected) + + # partial overlapping columns + df2.columns = ["A", "C"] + result = df1.isin(df2) + expected["B"] = False + tm.assert_frame_equal(result, expected) + + def test_isin_tuples(self): + # GH#16394 + df = pd.DataFrame({"A": [1, 2, 3], "B": ["a", "b", "f"]}) + df["C"] = list(zip(df["A"], df["B"])) + result = df["C"].isin([(1, "a")]) + tm.assert_series_equal(result, Series([True, False, False], name="C")) + + def test_isin_df_dupe_values(self): + df1 = DataFrame({"A": [1, 2, 3, 4], "B": [2, np.nan, 4, 4]}) + # just cols duped + df2 = DataFrame([[0, 2], [12, 4], [2, np.nan], [4, 5]], columns=["B", "B"]) + with pytest.raises(ValueError): + df1.isin(df2) + + # just index duped + df2 = DataFrame( + [[0, 2], [12, 4], [2, np.nan], [4, 5]], + columns=["A", "B"], + index=[0, 0, 1, 1], + ) + with pytest.raises(ValueError): + df1.isin(df2) + + # cols and index: + df2.columns = ["B", "B"] + with pytest.raises(ValueError): + df1.isin(df2) + + def test_isin_dupe_self(self): + other = DataFrame({"A": [1, 0, 1, 0], "B": [1, 1, 0, 0]}) + df = DataFrame([[1, 1], [1, 0], [0, 0]], columns=["A", "A"]) + result = df.isin(other) + expected = DataFrame(False, index=df.index, columns=df.columns) + expected.loc[0] = True + expected.iloc[1, 1] = True + tm.assert_frame_equal(result, expected) + + def test_isin_against_series(self): + df = pd.DataFrame( + {"A": [1, 2, 3, 4], "B": [2, np.nan, 4, 4]}, index=["a", "b", "c", "d"] + ) + s = pd.Series([1, 3, 11, 4], index=["a", "b", "c", "d"]) + expected = DataFrame(False, index=df.index, columns=df.columns) + expected["A"].loc["a"] = True + expected.loc["d"] = True + result = df.isin(s) + tm.assert_frame_equal(result, expected) + + def test_isin_multiIndex(self): + idx = MultiIndex.from_tuples( + [ + (0, "a", "foo"), + (0, "a", "bar"), + (0, "b", "bar"), + (0, "b", "baz"), + (2, "a", "foo"), + (2, "a", "bar"), + (2, "c", "bar"), + (2, "c", "baz"), + (1, "b", "foo"), + (1, "b", "bar"), + (1, "c", "bar"), + (1, "c", "baz"), + ] + ) + df1 = DataFrame({"A": np.ones(12), "B": np.zeros(12)}, index=idx) + df2 = DataFrame( + { + "A": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], + "B": [1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1], + } + ) + # against regular index + expected = DataFrame(False, index=df1.index, columns=df1.columns) + result = df1.isin(df2) + tm.assert_frame_equal(result, expected) + + df2.index = idx + expected = df2.values.astype(np.bool) + expected[:, 1] = ~expected[:, 1] + expected = DataFrame(expected, columns=["A", "B"], index=idx) + + result = df1.isin(df2) + tm.assert_frame_equal(result, expected) + + def test_isin_empty_datetimelike(self): + # GH#15473 + df1_ts = DataFrame({"date": pd.to_datetime(["2014-01-01", "2014-01-02"])}) + df1_td = DataFrame({"date": [pd.Timedelta(1, "s"), pd.Timedelta(2, "s")]}) + df2 = DataFrame({"date": []}) + df3 = DataFrame() + + expected = DataFrame({"date": [False, False]}) + + result = df1_ts.isin(df2) + tm.assert_frame_equal(result, expected) + result = df1_ts.isin(df3) + tm.assert_frame_equal(result, expected) + + result = df1_td.isin(df2) + tm.assert_frame_equal(result, expected) + result = df1_td.isin(df3) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_transpose.py b/pandas/tests/frame/methods/test_transpose.py new file mode 100644 index 0000000000000..71843053cf3a8 --- /dev/null +++ b/pandas/tests/frame/methods/test_transpose.py @@ -0,0 +1,43 @@ +import pandas as pd +import pandas.util.testing as tm + + +class TestTranspose: + def test_transpose_tzaware_1col_single_tz(self): + # GH#26825 + dti = pd.date_range("2016-04-05 04:30", periods=3, tz="UTC") + + df = pd.DataFrame(dti) + assert (df.dtypes == dti.dtype).all() + res = df.T + assert (res.dtypes == dti.dtype).all() + + def test_transpose_tzaware_2col_single_tz(self): + # GH#26825 + dti = pd.date_range("2016-04-05 04:30", periods=3, tz="UTC") + + df3 = pd.DataFrame({"A": dti, "B": dti}) + assert (df3.dtypes == dti.dtype).all() + res3 = df3.T + assert (res3.dtypes == dti.dtype).all() + + def test_transpose_tzaware_2col_mixed_tz(self): + # GH#26825 + dti = pd.date_range("2016-04-05 04:30", periods=3, tz="UTC") + dti2 = dti.tz_convert("US/Pacific") + + df4 = pd.DataFrame({"A": dti, "B": dti2}) + assert (df4.dtypes == [dti.dtype, dti2.dtype]).all() + assert (df4.T.dtypes == object).all() + tm.assert_frame_equal(df4.T.T, df4) + + def test_transpose_object_to_tzaware_mixed_tz(self): + # GH#26825 + dti = pd.date_range("2016-04-05 04:30", periods=3, tz="UTC") + dti2 = dti.tz_convert("US/Pacific") + + # mixed all-tzaware dtypes + df2 = pd.DataFrame([dti, dti2]) + assert (df2.dtypes == object).all() + res2 = df2.T + assert (res2.dtypes == [dti.dtype, dti2.dtype]).all() diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index ee9329da4e5e1..1a241cd72ec43 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -530,335 +530,6 @@ def test_corrwith_kendall(self): expected = Series(np.ones(len(result))) tm.assert_series_equal(result, expected) - # --------------------------------------------------------------------- - # Describe - - def test_bool_describe_in_mixed_frame(self): - df = DataFrame( - { - "string_data": ["a", "b", "c", "d", "e"], - "bool_data": [True, True, False, False, False], - "int_data": [10, 20, 30, 40, 50], - } - ) - - # Integer data are included in .describe() output, - # Boolean and string data are not. - result = df.describe() - expected = DataFrame( - {"int_data": [5, 30, df.int_data.std(), 10, 20, 30, 40, 50]}, - index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], - ) - tm.assert_frame_equal(result, expected) - - # Top value is a boolean value that is False - result = df.describe(include=["bool"]) - - expected = DataFrame( - {"bool_data": [5, 2, False, 3]}, index=["count", "unique", "top", "freq"] - ) - tm.assert_frame_equal(result, expected) - - def test_describe_empty_object(self): - # https://github.com/pandas-dev/pandas/issues/27183 - df = pd.DataFrame({"A": [None, None]}, dtype=object) - result = df.describe() - expected = pd.DataFrame( - {"A": [0, 0, np.nan, np.nan]}, - dtype=object, - index=["count", "unique", "top", "freq"], - ) - tm.assert_frame_equal(result, expected) - - result = df.iloc[:0].describe() - tm.assert_frame_equal(result, expected) - - def test_describe_bool_frame(self): - # GH 13891 - df = pd.DataFrame( - { - "bool_data_1": [False, False, True, True], - "bool_data_2": [False, True, True, True], - } - ) - result = df.describe() - expected = DataFrame( - {"bool_data_1": [4, 2, True, 2], "bool_data_2": [4, 2, True, 3]}, - index=["count", "unique", "top", "freq"], - ) - tm.assert_frame_equal(result, expected) - - df = pd.DataFrame( - { - "bool_data": [False, False, True, True, False], - "int_data": [0, 1, 2, 3, 4], - } - ) - result = df.describe() - expected = DataFrame( - {"int_data": [5, 2, df.int_data.std(), 0, 1, 2, 3, 4]}, - index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], - ) - tm.assert_frame_equal(result, expected) - - df = pd.DataFrame( - {"bool_data": [False, False, True, True], "str_data": ["a", "b", "c", "a"]} - ) - result = df.describe() - expected = DataFrame( - {"bool_data": [4, 2, True, 2], "str_data": [4, 3, "a", 2]}, - index=["count", "unique", "top", "freq"], - ) - tm.assert_frame_equal(result, expected) - - def test_describe_categorical(self): - df = DataFrame({"value": np.random.randint(0, 10000, 100)}) - labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)] - cat_labels = Categorical(labels, labels) - - df = df.sort_values(by=["value"], ascending=True) - df["value_group"] = pd.cut( - df.value, range(0, 10500, 500), right=False, labels=cat_labels - ) - cat = df - - # Categoricals should not show up together with numerical columns - result = cat.describe() - assert len(result.columns) == 1 - - # In a frame, describe() for the cat should be the same as for string - # arrays (count, unique, top, freq) - - cat = Categorical( - ["a", "b", "b", "b"], categories=["a", "b", "c"], ordered=True - ) - s = Series(cat) - result = s.describe() - expected = Series([4, 2, "b", 3], index=["count", "unique", "top", "freq"]) - tm.assert_series_equal(result, expected) - - cat = Series(Categorical(["a", "b", "c", "c"])) - df3 = DataFrame({"cat": cat, "s": ["a", "b", "c", "c"]}) - result = df3.describe() - tm.assert_numpy_array_equal(result["cat"].values, result["s"].values) - - def test_describe_empty_categorical_column(self): - # GH 26397 - # Ensure the index of an an empty categorical DataFrame column - # also contains (count, unique, top, freq) - df = pd.DataFrame({"empty_col": Categorical([])}) - result = df.describe() - expected = DataFrame( - {"empty_col": [0, 0, np.nan, np.nan]}, - index=["count", "unique", "top", "freq"], - dtype="object", - ) - tm.assert_frame_equal(result, expected) - # ensure NaN, not None - assert np.isnan(result.iloc[2, 0]) - assert np.isnan(result.iloc[3, 0]) - - def test_describe_categorical_columns(self): - # GH 11558 - columns = pd.CategoricalIndex(["int1", "int2", "obj"], ordered=True, name="XXX") - df = DataFrame( - { - "int1": [10, 20, 30, 40, 50], - "int2": [10, 20, 30, 40, 50], - "obj": ["A", 0, None, "X", 1], - }, - columns=columns, - ) - result = df.describe() - - exp_columns = pd.CategoricalIndex( - ["int1", "int2"], - categories=["int1", "int2", "obj"], - ordered=True, - name="XXX", - ) - expected = DataFrame( - { - "int1": [5, 30, df.int1.std(), 10, 20, 30, 40, 50], - "int2": [5, 30, df.int2.std(), 10, 20, 30, 40, 50], - }, - index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], - columns=exp_columns, - ) - - tm.assert_frame_equal(result, expected) - tm.assert_categorical_equal(result.columns.values, expected.columns.values) - - def test_describe_datetime_columns(self): - columns = pd.DatetimeIndex( - ["2011-01-01", "2011-02-01", "2011-03-01"], - freq="MS", - tz="US/Eastern", - name="XXX", - ) - df = DataFrame( - { - 0: [10, 20, 30, 40, 50], - 1: [10, 20, 30, 40, 50], - 2: ["A", 0, None, "X", 1], - } - ) - df.columns = columns - result = df.describe() - - exp_columns = pd.DatetimeIndex( - ["2011-01-01", "2011-02-01"], freq="MS", tz="US/Eastern", name="XXX" - ) - expected = DataFrame( - { - 0: [5, 30, df.iloc[:, 0].std(), 10, 20, 30, 40, 50], - 1: [5, 30, df.iloc[:, 1].std(), 10, 20, 30, 40, 50], - }, - index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], - ) - expected.columns = exp_columns - tm.assert_frame_equal(result, expected) - assert result.columns.freq == "MS" - assert result.columns.tz == expected.columns.tz - - def test_describe_timedelta_values(self): - # GH 6145 - t1 = pd.timedelta_range("1 days", freq="D", periods=5) - t2 = pd.timedelta_range("1 hours", freq="H", periods=5) - df = pd.DataFrame({"t1": t1, "t2": t2}) - - expected = DataFrame( - { - "t1": [ - 5, - pd.Timedelta("3 days"), - df.iloc[:, 0].std(), - pd.Timedelta("1 days"), - pd.Timedelta("2 days"), - pd.Timedelta("3 days"), - pd.Timedelta("4 days"), - pd.Timedelta("5 days"), - ], - "t2": [ - 5, - pd.Timedelta("3 hours"), - df.iloc[:, 1].std(), - pd.Timedelta("1 hours"), - pd.Timedelta("2 hours"), - pd.Timedelta("3 hours"), - pd.Timedelta("4 hours"), - pd.Timedelta("5 hours"), - ], - }, - index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], - ) - - result = df.describe() - tm.assert_frame_equal(result, expected) - - exp_repr = ( - " t1 t2\n" - "count 5 5\n" - "mean 3 days 00:00:00 0 days 03:00:00\n" - "std 1 days 13:56:50.394919 0 days 01:34:52.099788\n" - "min 1 days 00:00:00 0 days 01:00:00\n" - "25% 2 days 00:00:00 0 days 02:00:00\n" - "50% 3 days 00:00:00 0 days 03:00:00\n" - "75% 4 days 00:00:00 0 days 04:00:00\n" - "max 5 days 00:00:00 0 days 05:00:00" - ) - assert repr(result) == exp_repr - - def test_describe_tz_values(self, tz_naive_fixture): - # GH 21332 - tz = tz_naive_fixture - s1 = Series(range(5)) - start = Timestamp(2018, 1, 1) - end = Timestamp(2018, 1, 5) - s2 = Series(date_range(start, end, tz=tz)) - df = pd.DataFrame({"s1": s1, "s2": s2}) - - expected = DataFrame( - { - "s1": [ - 5, - np.nan, - np.nan, - np.nan, - np.nan, - np.nan, - 2, - 1.581139, - 0, - 1, - 2, - 3, - 4, - ], - "s2": [ - 5, - 5, - s2.value_counts().index[0], - 1, - start.tz_localize(tz), - end.tz_localize(tz), - np.nan, - np.nan, - np.nan, - np.nan, - np.nan, - np.nan, - np.nan, - ], - }, - index=[ - "count", - "unique", - "top", - "freq", - "first", - "last", - "mean", - "std", - "min", - "25%", - "50%", - "75%", - "max", - ], - ) - result = df.describe(include="all") - tm.assert_frame_equal(result, expected) - - def test_describe_percentiles_integer_idx(self): - # Issue 26660 - df = pd.DataFrame({"x": [1]}) - pct = np.linspace(0, 1, 10 + 1) - result = df.describe(percentiles=pct) - - expected = DataFrame( - {"x": [1.0, 1.0, np.NaN, 1.0, *[1.0 for _ in pct], 1.0]}, - index=[ - "count", - "mean", - "std", - "min", - "0%", - "10%", - "20%", - "30%", - "40%", - "50%", - "60%", - "70%", - "80%", - "90%", - "100%", - "max", - ], - ) - tm.assert_frame_equal(result, expected) - # --------------------------------------------------------------------- # Reductions @@ -1781,187 +1452,6 @@ def test_any_all_level_axis_none_raises(self, method): with pytest.raises(ValueError, match=xpr): getattr(df, method)(axis=None, level="out") - # ---------------------------------------------------------------------- - # Isin - - def test_isin(self): - # GH 4211 - df = DataFrame( - { - "vals": [1, 2, 3, 4], - "ids": ["a", "b", "f", "n"], - "ids2": ["a", "n", "c", "n"], - }, - index=["foo", "bar", "baz", "qux"], - ) - other = ["a", "b", "c"] - - result = df.isin(other) - expected = DataFrame([df.loc[s].isin(other) for s in df.index]) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("empty", [[], Series(dtype=object), np.array([])]) - def test_isin_empty(self, empty): - # GH 16991 - df = DataFrame({"A": ["a", "b", "c"], "B": ["a", "e", "f"]}) - expected = DataFrame(False, df.index, df.columns) - - result = df.isin(empty) - tm.assert_frame_equal(result, expected) - - def test_isin_dict(self): - df = DataFrame({"A": ["a", "b", "c"], "B": ["a", "e", "f"]}) - d = {"A": ["a"]} - - expected = DataFrame(False, df.index, df.columns) - expected.loc[0, "A"] = True - - result = df.isin(d) - tm.assert_frame_equal(result, expected) - - # non unique columns - df = DataFrame({"A": ["a", "b", "c"], "B": ["a", "e", "f"]}) - df.columns = ["A", "A"] - expected = DataFrame(False, df.index, df.columns) - expected.loc[0, "A"] = True - result = df.isin(d) - tm.assert_frame_equal(result, expected) - - def test_isin_with_string_scalar(self): - # GH 4763 - df = DataFrame( - { - "vals": [1, 2, 3, 4], - "ids": ["a", "b", "f", "n"], - "ids2": ["a", "n", "c", "n"], - }, - index=["foo", "bar", "baz", "qux"], - ) - with pytest.raises(TypeError): - df.isin("a") - - with pytest.raises(TypeError): - df.isin("aaa") - - def test_isin_df(self): - df1 = DataFrame({"A": [1, 2, 3, 4], "B": [2, np.nan, 4, 4]}) - df2 = DataFrame({"A": [0, 2, 12, 4], "B": [2, np.nan, 4, 5]}) - expected = DataFrame(False, df1.index, df1.columns) - result = df1.isin(df2) - expected["A"].loc[[1, 3]] = True - expected["B"].loc[[0, 2]] = True - tm.assert_frame_equal(result, expected) - - # partial overlapping columns - df2.columns = ["A", "C"] - result = df1.isin(df2) - expected["B"] = False - tm.assert_frame_equal(result, expected) - - def test_isin_tuples(self): - # GH 16394 - df = pd.DataFrame({"A": [1, 2, 3], "B": ["a", "b", "f"]}) - df["C"] = list(zip(df["A"], df["B"])) - result = df["C"].isin([(1, "a")]) - tm.assert_series_equal(result, Series([True, False, False], name="C")) - - def test_isin_df_dupe_values(self): - df1 = DataFrame({"A": [1, 2, 3, 4], "B": [2, np.nan, 4, 4]}) - # just cols duped - df2 = DataFrame([[0, 2], [12, 4], [2, np.nan], [4, 5]], columns=["B", "B"]) - with pytest.raises(ValueError): - df1.isin(df2) - - # just index duped - df2 = DataFrame( - [[0, 2], [12, 4], [2, np.nan], [4, 5]], - columns=["A", "B"], - index=[0, 0, 1, 1], - ) - with pytest.raises(ValueError): - df1.isin(df2) - - # cols and index: - df2.columns = ["B", "B"] - with pytest.raises(ValueError): - df1.isin(df2) - - def test_isin_dupe_self(self): - other = DataFrame({"A": [1, 0, 1, 0], "B": [1, 1, 0, 0]}) - df = DataFrame([[1, 1], [1, 0], [0, 0]], columns=["A", "A"]) - result = df.isin(other) - expected = DataFrame(False, index=df.index, columns=df.columns) - expected.loc[0] = True - expected.iloc[1, 1] = True - tm.assert_frame_equal(result, expected) - - def test_isin_against_series(self): - df = pd.DataFrame( - {"A": [1, 2, 3, 4], "B": [2, np.nan, 4, 4]}, index=["a", "b", "c", "d"] - ) - s = pd.Series([1, 3, 11, 4], index=["a", "b", "c", "d"]) - expected = DataFrame(False, index=df.index, columns=df.columns) - expected["A"].loc["a"] = True - expected.loc["d"] = True - result = df.isin(s) - tm.assert_frame_equal(result, expected) - - def test_isin_multiIndex(self): - idx = MultiIndex.from_tuples( - [ - (0, "a", "foo"), - (0, "a", "bar"), - (0, "b", "bar"), - (0, "b", "baz"), - (2, "a", "foo"), - (2, "a", "bar"), - (2, "c", "bar"), - (2, "c", "baz"), - (1, "b", "foo"), - (1, "b", "bar"), - (1, "c", "bar"), - (1, "c", "baz"), - ] - ) - df1 = DataFrame({"A": np.ones(12), "B": np.zeros(12)}, index=idx) - df2 = DataFrame( - { - "A": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], - "B": [1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1], - } - ) - # against regular index - expected = DataFrame(False, index=df1.index, columns=df1.columns) - result = df1.isin(df2) - tm.assert_frame_equal(result, expected) - - df2.index = idx - expected = df2.values.astype(np.bool) - expected[:, 1] = ~expected[:, 1] - expected = DataFrame(expected, columns=["A", "B"], index=idx) - - result = df1.isin(df2) - tm.assert_frame_equal(result, expected) - - def test_isin_empty_datetimelike(self): - # GH 15473 - df1_ts = DataFrame({"date": pd.to_datetime(["2014-01-01", "2014-01-02"])}) - df1_td = DataFrame({"date": [pd.Timedelta(1, "s"), pd.Timedelta(2, "s")]}) - df2 = DataFrame({"date": []}) - df3 = DataFrame() - - expected = DataFrame({"date": [False, False]}) - - result = df1_ts.isin(df2) - tm.assert_frame_equal(result, expected) - result = df1_ts.isin(df3) - tm.assert_frame_equal(result, expected) - - result = df1_td.isin(df2) - tm.assert_frame_equal(result, expected) - result = df1_td.isin(df3) - tm.assert_frame_equal(result, expected) - # --------------------------------------------------------------------- # Rounding @@ -2174,158 +1664,6 @@ def test_round_interval_category_columns(self): expected = DataFrame([[1.0, 1.0], [0.0, 0.0]], columns=columns) tm.assert_frame_equal(result, expected) - # --------------------------------------------------------------------- - # Clip - - def test_clip(self, float_frame): - median = float_frame.median().median() - original = float_frame.copy() - - double = float_frame.clip(upper=median, lower=median) - assert not (double.values != median).any() - - # Verify that float_frame was not changed inplace - assert (float_frame.values == original.values).all() - - def test_inplace_clip(self, float_frame): - # GH 15388 - median = float_frame.median().median() - frame_copy = float_frame.copy() - - frame_copy.clip(upper=median, lower=median, inplace=True) - assert not (frame_copy.values != median).any() - - def test_dataframe_clip(self): - # GH 2747 - df = DataFrame(np.random.randn(1000, 2)) - - for lb, ub in [(-1, 1), (1, -1)]: - clipped_df = df.clip(lb, ub) - - lb, ub = min(lb, ub), max(ub, lb) - lb_mask = df.values <= lb - ub_mask = df.values >= ub - mask = ~lb_mask & ~ub_mask - assert (clipped_df.values[lb_mask] == lb).all() - assert (clipped_df.values[ub_mask] == ub).all() - assert (clipped_df.values[mask] == df.values[mask]).all() - - def test_clip_mixed_numeric(self): - # TODO(jreback) - # clip on mixed integer or floats - # with integer clippers coerces to float - df = DataFrame({"A": [1, 2, 3], "B": [1.0, np.nan, 3.0]}) - result = df.clip(1, 2) - expected = DataFrame({"A": [1, 2, 2], "B": [1.0, np.nan, 2.0]}) - tm.assert_frame_equal(result, expected, check_like=True) - - # GH 24162, clipping now preserves numeric types per column - df = DataFrame([[1, 2, 3.4], [3, 4, 5.6]], columns=["foo", "bar", "baz"]) - expected = df.dtypes - result = df.clip(upper=3).dtypes - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize("inplace", [True, False]) - def test_clip_against_series(self, inplace): - # GH 6966 - - df = DataFrame(np.random.randn(1000, 2)) - lb = Series(np.random.randn(1000)) - ub = lb + 1 - - original = df.copy() - clipped_df = df.clip(lb, ub, axis=0, inplace=inplace) - - if inplace: - clipped_df = df - - for i in range(2): - lb_mask = original.iloc[:, i] <= lb - ub_mask = original.iloc[:, i] >= ub - mask = ~lb_mask & ~ub_mask - - result = clipped_df.loc[lb_mask, i] - tm.assert_series_equal(result, lb[lb_mask], check_names=False) - assert result.name == i - - result = clipped_df.loc[ub_mask, i] - tm.assert_series_equal(result, ub[ub_mask], check_names=False) - assert result.name == i - - tm.assert_series_equal(clipped_df.loc[mask, i], df.loc[mask, i]) - - @pytest.mark.parametrize("inplace", [True, False]) - @pytest.mark.parametrize("lower", [[2, 3, 4], np.asarray([2, 3, 4])]) - @pytest.mark.parametrize( - "axis,res", - [ - (0, [[2.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 7.0, 7.0]]), - (1, [[2.0, 3.0, 4.0], [4.0, 5.0, 6.0], [5.0, 6.0, 7.0]]), - ], - ) - def test_clip_against_list_like(self, simple_frame, inplace, lower, axis, res): - # GH 15390 - original = simple_frame.copy(deep=True) - - result = original.clip(lower=lower, upper=[5, 6, 7], axis=axis, inplace=inplace) - - expected = pd.DataFrame(res, columns=original.columns, index=original.index) - if inplace: - result = original - tm.assert_frame_equal(result, expected, check_exact=True) - - @pytest.mark.parametrize("axis", [0, 1, None]) - def test_clip_against_frame(self, axis): - df = DataFrame(np.random.randn(1000, 2)) - lb = DataFrame(np.random.randn(1000, 2)) - ub = lb + 1 - - clipped_df = df.clip(lb, ub, axis=axis) - - lb_mask = df <= lb - ub_mask = df >= ub - mask = ~lb_mask & ~ub_mask - - tm.assert_frame_equal(clipped_df[lb_mask], lb[lb_mask]) - tm.assert_frame_equal(clipped_df[ub_mask], ub[ub_mask]) - tm.assert_frame_equal(clipped_df[mask], df[mask]) - - def test_clip_against_unordered_columns(self): - # GH 20911 - df1 = DataFrame(np.random.randn(1000, 4), columns=["A", "B", "C", "D"]) - df2 = DataFrame(np.random.randn(1000, 4), columns=["D", "A", "B", "C"]) - df3 = DataFrame(df2.values - 1, columns=["B", "D", "C", "A"]) - result_upper = df1.clip(lower=0, upper=df2) - expected_upper = df1.clip(lower=0, upper=df2[df1.columns]) - result_lower = df1.clip(lower=df3, upper=3) - expected_lower = df1.clip(lower=df3[df1.columns], upper=3) - result_lower_upper = df1.clip(lower=df3, upper=df2) - expected_lower_upper = df1.clip(lower=df3[df1.columns], upper=df2[df1.columns]) - tm.assert_frame_equal(result_upper, expected_upper) - tm.assert_frame_equal(result_lower, expected_lower) - tm.assert_frame_equal(result_lower_upper, expected_lower_upper) - - def test_clip_with_na_args(self, float_frame): - """Should process np.nan argument as None """ - # GH 17276 - tm.assert_frame_equal(float_frame.clip(np.nan), float_frame) - tm.assert_frame_equal(float_frame.clip(upper=np.nan, lower=np.nan), float_frame) - - # GH 19992 - df = DataFrame({"col_0": [1, 2, 3], "col_1": [4, 5, 6], "col_2": [7, 8, 9]}) - - result = df.clip(lower=[4, 5, np.nan], axis=0) - expected = DataFrame( - {"col_0": [4, 5, np.nan], "col_1": [4, 5, np.nan], "col_2": [7, 8, np.nan]} - ) - tm.assert_frame_equal(result, expected) - - result = df.clip(lower=[4, 5, np.nan], axis=1) - expected = DataFrame( - {"col_0": [4, 4, 4], "col_1": [5, 5, 6], "col_2": [np.nan, np.nan, np.nan]} - ) - tm.assert_frame_equal(result, expected) - # --------------------------------------------------------------------- # Matrix-like diff --git a/pandas/tests/frame/test_operators.py b/pandas/tests/frame/test_operators.py index f3e61dffb500d..a4f1c0688b144 100644 --- a/pandas/tests/frame/test_operators.py +++ b/pandas/tests/frame/test_operators.py @@ -888,44 +888,3 @@ def test_no_warning(self, all_arithmetic_operators): b = df["B"] with tm.assert_produces_warning(None): getattr(df, all_arithmetic_operators)(b, 0) - - -class TestTranspose: - def test_transpose_tzaware_1col_single_tz(self): - # GH#26825 - dti = pd.date_range("2016-04-05 04:30", periods=3, tz="UTC") - - df = pd.DataFrame(dti) - assert (df.dtypes == dti.dtype).all() - res = df.T - assert (res.dtypes == dti.dtype).all() - - def test_transpose_tzaware_2col_single_tz(self): - # GH#26825 - dti = pd.date_range("2016-04-05 04:30", periods=3, tz="UTC") - - df3 = pd.DataFrame({"A": dti, "B": dti}) - assert (df3.dtypes == dti.dtype).all() - res3 = df3.T - assert (res3.dtypes == dti.dtype).all() - - def test_transpose_tzaware_2col_mixed_tz(self): - # GH#26825 - dti = pd.date_range("2016-04-05 04:30", periods=3, tz="UTC") - dti2 = dti.tz_convert("US/Pacific") - - df4 = pd.DataFrame({"A": dti, "B": dti2}) - assert (df4.dtypes == [dti.dtype, dti2.dtype]).all() - assert (df4.T.dtypes == object).all() - tm.assert_frame_equal(df4.T.T, df4) - - def test_transpose_object_to_tzaware_mixed_tz(self): - # GH#26825 - dti = pd.date_range("2016-04-05 04:30", periods=3, tz="UTC") - dti2 = dti.tz_convert("US/Pacific") - - # mixed all-tzaware dtypes - df2 = pd.DataFrame([dti, dti2]) - assert (df2.dtypes == object).all() - res2 = df2.T - assert (res2.dtypes == [dti.dtype, dti2.dtype]).all() diff --git a/pandas/tests/series/methods/test_clip.py b/pandas/tests/series/methods/test_clip.py new file mode 100644 index 0000000000000..c2bec2744583a --- /dev/null +++ b/pandas/tests/series/methods/test_clip.py @@ -0,0 +1,99 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import Series, Timestamp, isna, notna +import pandas.util.testing as tm + + +class TestSeriesClip: + def test_clip(self, datetime_series): + val = datetime_series.median() + + assert datetime_series.clip(lower=val).min() == val + assert datetime_series.clip(upper=val).max() == val + + result = datetime_series.clip(-0.5, 0.5) + expected = np.clip(datetime_series, -0.5, 0.5) + tm.assert_series_equal(result, expected) + assert isinstance(expected, Series) + + def test_clip_types_and_nulls(self): + + sers = [ + Series([np.nan, 1.0, 2.0, 3.0]), + Series([None, "a", "b", "c"]), + Series(pd.to_datetime([np.nan, 1, 2, 3], unit="D")), + ] + + for s in sers: + thresh = s[2] + lower = s.clip(lower=thresh) + upper = s.clip(upper=thresh) + assert lower[notna(lower)].min() == thresh + assert upper[notna(upper)].max() == thresh + assert list(isna(s)) == list(isna(lower)) + assert list(isna(s)) == list(isna(upper)) + + def test_clip_with_na_args(self): + """Should process np.nan argument as None """ + # GH#17276 + s = Series([1, 2, 3]) + + tm.assert_series_equal(s.clip(np.nan), Series([1, 2, 3])) + tm.assert_series_equal(s.clip(upper=np.nan, lower=np.nan), Series([1, 2, 3])) + + # GH#19992 + tm.assert_series_equal(s.clip(lower=[0, 4, np.nan]), Series([1, 4, np.nan])) + tm.assert_series_equal(s.clip(upper=[1, np.nan, 1]), Series([1, np.nan, 1])) + + def test_clip_against_series(self): + # GH#6966 + + s = Series([1.0, 1.0, 4.0]) + + lower = Series([1.0, 2.0, 3.0]) + upper = Series([1.5, 2.5, 3.5]) + + tm.assert_series_equal(s.clip(lower, upper), Series([1.0, 2.0, 3.5])) + tm.assert_series_equal(s.clip(1.5, upper), Series([1.5, 1.5, 3.5])) + + @pytest.mark.parametrize("inplace", [True, False]) + @pytest.mark.parametrize("upper", [[1, 2, 3], np.asarray([1, 2, 3])]) + def test_clip_against_list_like(self, inplace, upper): + # GH#15390 + original = pd.Series([5, 6, 7]) + result = original.clip(upper=upper, inplace=inplace) + expected = pd.Series([1, 2, 3]) + + if inplace: + result = original + tm.assert_series_equal(result, expected, check_exact=True) + + def test_clip_with_datetimes(self): + # GH#11838 + # naive and tz-aware datetimes + + t = Timestamp("2015-12-01 09:30:30") + s = Series([Timestamp("2015-12-01 09:30:00"), Timestamp("2015-12-01 09:31:00")]) + result = s.clip(upper=t) + expected = Series( + [Timestamp("2015-12-01 09:30:00"), Timestamp("2015-12-01 09:30:30")] + ) + tm.assert_series_equal(result, expected) + + t = Timestamp("2015-12-01 09:30:30", tz="US/Eastern") + s = Series( + [ + Timestamp("2015-12-01 09:30:00", tz="US/Eastern"), + Timestamp("2015-12-01 09:31:00", tz="US/Eastern"), + ] + ) + result = s.clip(upper=t) + expected = Series( + [ + Timestamp("2015-12-01 09:30:00", tz="US/Eastern"), + Timestamp("2015-12-01 09:30:30", tz="US/Eastern"), + ] + ) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_describe.py b/pandas/tests/series/methods/test_describe.py new file mode 100644 index 0000000000000..ed412e7da3d43 --- /dev/null +++ b/pandas/tests/series/methods/test_describe.py @@ -0,0 +1,69 @@ +import numpy as np + +from pandas import Series, Timestamp, date_range +import pandas.util.testing as tm + + +class TestSeriesDescribe: + def test_describe(self): + s = Series([0, 1, 2, 3, 4], name="int_data") + result = s.describe() + expected = Series( + [5, 2, s.std(), 0, 1, 2, 3, 4], + name="int_data", + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + ) + tm.assert_series_equal(result, expected) + + s = Series([True, True, False, False, False], name="bool_data") + result = s.describe() + expected = Series( + [5, 2, False, 3], name="bool_data", index=["count", "unique", "top", "freq"] + ) + tm.assert_series_equal(result, expected) + + s = Series(["a", "a", "b", "c", "d"], name="str_data") + result = s.describe() + expected = Series( + [5, 4, "a", 2], name="str_data", index=["count", "unique", "top", "freq"] + ) + tm.assert_series_equal(result, expected) + + def test_describe_empty_object(self): + # https://github.com/pandas-dev/pandas/issues/27183 + s = Series([None, None], dtype=object) + result = s.describe() + expected = Series( + [0, 0, np.nan, np.nan], + dtype=object, + index=["count", "unique", "top", "freq"], + ) + tm.assert_series_equal(result, expected) + + result = s[:0].describe() + tm.assert_series_equal(result, expected) + # ensure NaN, not None + assert np.isnan(result.iloc[2]) + assert np.isnan(result.iloc[3]) + + def test_describe_with_tz(self, tz_naive_fixture): + # GH 21332 + tz = tz_naive_fixture + name = str(tz_naive_fixture) + start = Timestamp(2018, 1, 1) + end = Timestamp(2018, 1, 5) + s = Series(date_range(start, end, tz=tz), name=name) + result = s.describe() + expected = Series( + [ + 5, + 5, + s.value_counts().index[0], + 1, + start.tz_localize(tz), + end.tz_localize(tz), + ], + name=name, + index=["count", "unique", "top", "freq", "first", "last"], + ) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_isin.py b/pandas/tests/series/methods/test_isin.py new file mode 100644 index 0000000000000..69b2f896aec52 --- /dev/null +++ b/pandas/tests/series/methods/test_isin.py @@ -0,0 +1,82 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import Series, date_range +import pandas.util.testing as tm + + +class TestSeriesIsIn: + def test_isin(self): + s = Series(["A", "B", "C", "a", "B", "B", "A", "C"]) + + result = s.isin(["A", "C"]) + expected = Series([True, False, True, False, False, False, True, True]) + tm.assert_series_equal(result, expected) + + # GH#16012 + # This specific issue has to have a series over 1e6 in len, but the + # comparison array (in_list) must be large enough so that numpy doesn't + # do a manual masking trick that will avoid this issue altogether + s = Series(list("abcdefghijk" * 10 ** 5)) + # If numpy doesn't do the manual comparison/mask, these + # unorderable mixed types are what cause the exception in numpy + in_list = [-1, "a", "b", "G", "Y", "Z", "E", "K", "E", "S", "I", "R", "R"] * 6 + + assert s.isin(in_list).sum() == 200000 + + def test_isin_with_string_scalar(self): + # GH#4763 + s = Series(["A", "B", "C", "a", "B", "B", "A", "C"]) + msg = ( + r"only list-like objects are allowed to be passed to isin\(\)," + r" you passed a \[str\]" + ) + with pytest.raises(TypeError, match=msg): + s.isin("a") + + s = Series(["aaa", "b", "c"]) + with pytest.raises(TypeError, match=msg): + s.isin("aaa") + + def test_isin_with_i8(self): + # GH#5021 + + expected = Series([True, True, False, False, False]) + expected2 = Series([False, True, False, False, False]) + + # datetime64[ns] + s = Series(date_range("jan-01-2013", "jan-05-2013")) + + result = s.isin(s[0:2]) + tm.assert_series_equal(result, expected) + + result = s.isin(s[0:2].values) + tm.assert_series_equal(result, expected) + + # fails on dtype conversion in the first place + result = s.isin(s[0:2].values.astype("datetime64[D]")) + tm.assert_series_equal(result, expected) + + result = s.isin([s[1]]) + tm.assert_series_equal(result, expected2) + + result = s.isin([np.datetime64(s[1])]) + tm.assert_series_equal(result, expected2) + + result = s.isin(set(s[0:2])) + tm.assert_series_equal(result, expected) + + # timedelta64[ns] + s = Series(pd.to_timedelta(range(5), unit="d")) + result = s.isin(s[0:2]) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("empty", [[], Series(dtype=object), np.array([])]) + def test_isin_empty(self, empty): + # see GH#16991 + s = Series(["a", "b"]) + expected = Series([False, False]) + + result = s.isin(empty) + tm.assert_series_equal(expected, result) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 6b85714d06594..86931ae23caee 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -6,76 +6,13 @@ import pandas.util._test_decorators as td import pandas as pd -from pandas import Categorical, DataFrame, MultiIndex, Series, date_range, isna, notna +from pandas import Categorical, DataFrame, MultiIndex, Series, date_range, isna from pandas.core.indexes.datetimes import Timestamp from pandas.core.indexes.timedeltas import TimedeltaIndex import pandas.util.testing as tm class TestSeriesAnalytics: - def test_describe(self): - s = Series([0, 1, 2, 3, 4], name="int_data") - result = s.describe() - expected = Series( - [5, 2, s.std(), 0, 1, 2, 3, 4], - name="int_data", - index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], - ) - tm.assert_series_equal(result, expected) - - s = Series([True, True, False, False, False], name="bool_data") - result = s.describe() - expected = Series( - [5, 2, False, 3], name="bool_data", index=["count", "unique", "top", "freq"] - ) - tm.assert_series_equal(result, expected) - - s = Series(["a", "a", "b", "c", "d"], name="str_data") - result = s.describe() - expected = Series( - [5, 4, "a", 2], name="str_data", index=["count", "unique", "top", "freq"] - ) - tm.assert_series_equal(result, expected) - - def test_describe_empty_object(self): - # https://github.com/pandas-dev/pandas/issues/27183 - s = pd.Series([None, None], dtype=object) - result = s.describe() - expected = pd.Series( - [0, 0, np.nan, np.nan], - dtype=object, - index=["count", "unique", "top", "freq"], - ) - tm.assert_series_equal(result, expected) - - result = s[:0].describe() - tm.assert_series_equal(result, expected) - # ensure NaN, not None - assert np.isnan(result.iloc[2]) - assert np.isnan(result.iloc[3]) - - def test_describe_with_tz(self, tz_naive_fixture): - # GH 21332 - tz = tz_naive_fixture - name = str(tz_naive_fixture) - start = Timestamp(2018, 1, 1) - end = Timestamp(2018, 1, 5) - s = Series(date_range(start, end, tz=tz), name=name) - result = s.describe() - expected = Series( - [ - 5, - 5, - s.value_counts().index[0], - 1, - start.tz_localize(tz), - end.tz_localize(tz), - ], - name=name, - index=["count", "unique", "top", "freq", "first", "last"], - ) - tm.assert_series_equal(result, expected) - def test_argsort(self, datetime_series): self._check_accum_op("argsort", datetime_series, check_dtype=False) argsorted = datetime_series.argsort() @@ -534,172 +471,6 @@ def test_matmul(self): with pytest.raises(ValueError, match=msg): a.dot(b.T) - def test_clip(self, datetime_series): - val = datetime_series.median() - - assert datetime_series.clip(lower=val).min() == val - assert datetime_series.clip(upper=val).max() == val - - result = datetime_series.clip(-0.5, 0.5) - expected = np.clip(datetime_series, -0.5, 0.5) - tm.assert_series_equal(result, expected) - assert isinstance(expected, Series) - - def test_clip_types_and_nulls(self): - - sers = [ - Series([np.nan, 1.0, 2.0, 3.0]), - Series([None, "a", "b", "c"]), - Series(pd.to_datetime([np.nan, 1, 2, 3], unit="D")), - ] - - for s in sers: - thresh = s[2] - lower = s.clip(lower=thresh) - upper = s.clip(upper=thresh) - assert lower[notna(lower)].min() == thresh - assert upper[notna(upper)].max() == thresh - assert list(isna(s)) == list(isna(lower)) - assert list(isna(s)) == list(isna(upper)) - - def test_clip_with_na_args(self): - """Should process np.nan argument as None """ - # GH # 17276 - s = Series([1, 2, 3]) - - tm.assert_series_equal(s.clip(np.nan), Series([1, 2, 3])) - tm.assert_series_equal(s.clip(upper=np.nan, lower=np.nan), Series([1, 2, 3])) - - # GH #19992 - tm.assert_series_equal(s.clip(lower=[0, 4, np.nan]), Series([1, 4, np.nan])) - tm.assert_series_equal(s.clip(upper=[1, np.nan, 1]), Series([1, np.nan, 1])) - - def test_clip_against_series(self): - # GH #6966 - - s = Series([1.0, 1.0, 4.0]) - - lower = Series([1.0, 2.0, 3.0]) - upper = Series([1.5, 2.5, 3.5]) - - tm.assert_series_equal(s.clip(lower, upper), Series([1.0, 2.0, 3.5])) - tm.assert_series_equal(s.clip(1.5, upper), Series([1.5, 1.5, 3.5])) - - @pytest.mark.parametrize("inplace", [True, False]) - @pytest.mark.parametrize("upper", [[1, 2, 3], np.asarray([1, 2, 3])]) - def test_clip_against_list_like(self, inplace, upper): - # GH #15390 - original = pd.Series([5, 6, 7]) - result = original.clip(upper=upper, inplace=inplace) - expected = pd.Series([1, 2, 3]) - - if inplace: - result = original - tm.assert_series_equal(result, expected, check_exact=True) - - def test_clip_with_datetimes(self): - - # GH 11838 - # naive and tz-aware datetimes - - t = Timestamp("2015-12-01 09:30:30") - s = Series([Timestamp("2015-12-01 09:30:00"), Timestamp("2015-12-01 09:31:00")]) - result = s.clip(upper=t) - expected = Series( - [Timestamp("2015-12-01 09:30:00"), Timestamp("2015-12-01 09:30:30")] - ) - tm.assert_series_equal(result, expected) - - t = Timestamp("2015-12-01 09:30:30", tz="US/Eastern") - s = Series( - [ - Timestamp("2015-12-01 09:30:00", tz="US/Eastern"), - Timestamp("2015-12-01 09:31:00", tz="US/Eastern"), - ] - ) - result = s.clip(upper=t) - expected = Series( - [ - Timestamp("2015-12-01 09:30:00", tz="US/Eastern"), - Timestamp("2015-12-01 09:30:30", tz="US/Eastern"), - ] - ) - tm.assert_series_equal(result, expected) - - def test_isin(self): - s = Series(["A", "B", "C", "a", "B", "B", "A", "C"]) - - result = s.isin(["A", "C"]) - expected = Series([True, False, True, False, False, False, True, True]) - tm.assert_series_equal(result, expected) - - # GH: 16012 - # This specific issue has to have a series over 1e6 in len, but the - # comparison array (in_list) must be large enough so that numpy doesn't - # do a manual masking trick that will avoid this issue altogether - s = Series(list("abcdefghijk" * 10 ** 5)) - # If numpy doesn't do the manual comparison/mask, these - # unorderable mixed types are what cause the exception in numpy - in_list = [-1, "a", "b", "G", "Y", "Z", "E", "K", "E", "S", "I", "R", "R"] * 6 - - assert s.isin(in_list).sum() == 200000 - - def test_isin_with_string_scalar(self): - # GH4763 - s = Series(["A", "B", "C", "a", "B", "B", "A", "C"]) - msg = ( - r"only list-like objects are allowed to be passed to isin\(\)," - r" you passed a \[str\]" - ) - with pytest.raises(TypeError, match=msg): - s.isin("a") - - s = Series(["aaa", "b", "c"]) - with pytest.raises(TypeError, match=msg): - s.isin("aaa") - - def test_isin_with_i8(self): - # GH 5021 - - expected = Series([True, True, False, False, False]) - expected2 = Series([False, True, False, False, False]) - - # datetime64[ns] - s = Series(date_range("jan-01-2013", "jan-05-2013")) - - result = s.isin(s[0:2]) - tm.assert_series_equal(result, expected) - - result = s.isin(s[0:2].values) - tm.assert_series_equal(result, expected) - - # fails on dtype conversion in the first place - result = s.isin(s[0:2].values.astype("datetime64[D]")) - tm.assert_series_equal(result, expected) - - result = s.isin([s[1]]) - tm.assert_series_equal(result, expected2) - - result = s.isin([np.datetime64(s[1])]) - tm.assert_series_equal(result, expected2) - - result = s.isin(set(s[0:2])) - tm.assert_series_equal(result, expected) - - # timedelta64[ns] - s = Series(pd.to_timedelta(range(5), unit="d")) - result = s.isin(s[0:2]) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize("empty", [[], Series(dtype=object), np.array([])]) - def test_isin_empty(self, empty): - # see gh-16991 - s = Series(["a", "b"]) - expected = Series([False, False]) - - result = s.isin(empty) - tm.assert_series_equal(expected, result) - def test_ptp(self): # GH21614 N = 1000