diff --git a/pandas/tests/frame/test_duplicates.py b/pandas/tests/frame/methods/test_drop_duplicates.py similarity index 81% rename from pandas/tests/frame/test_duplicates.py rename to pandas/tests/frame/methods/test_drop_duplicates.py index d2a1fc43d2046..a7715d1f31673 100644 --- a/pandas/tests/frame/test_duplicates.py +++ b/pandas/tests/frame/methods/test_drop_duplicates.py @@ -3,95 +3,20 @@ import numpy as np import pytest -from pandas import DataFrame, Series +from pandas import DataFrame import pandas.util.testing as tm @pytest.mark.parametrize("subset", ["a", ["a"], ["a", "B"]]) -def test_duplicated_with_misspelled_column_name(subset): +def test_drop_duplicates_with_misspelled_column_name(subset): # GH 19730 df = DataFrame({"A": [0, 0, 1], "B": [0, 0, 1], "C": [0, 0, 1]}) msg = re.escape("Index(['a'], dtype='object')") - with pytest.raises(KeyError, match=msg): - df.duplicated(subset) - with pytest.raises(KeyError, match=msg): df.drop_duplicates(subset) -@pytest.mark.slow -def test_duplicated_do_not_fail_on_wide_dataframes(): - # gh-21524 - # Given the wide dataframe with a lot of columns - # with different (important!) values - data = { - "col_{0:02d}".format(i): np.random.randint(0, 1000, 30000) for i in range(100) - } - df = DataFrame(data).T - result = df.duplicated() - - # Then duplicates produce the bool Series as a result and don't fail during - # calculation. Actual values doesn't matter here, though usually it's all - # False in this case - assert isinstance(result, Series) - assert result.dtype == np.bool - - -@pytest.mark.parametrize( - "keep, expected", - [ - ("first", Series([False, False, True, False, True])), - ("last", Series([True, True, False, False, False])), - (False, Series([True, True, True, False, True])), - ], -) -def test_duplicated_keep(keep, expected): - df = DataFrame({"A": [0, 1, 1, 2, 0], "B": ["a", "b", "b", "c", "a"]}) - - result = df.duplicated(keep=keep) - tm.assert_series_equal(result, expected) - - -@pytest.mark.xfail(reason="GH#21720; nan/None falsely considered equal") -@pytest.mark.parametrize( - "keep, expected", - [ - ("first", Series([False, False, True, False, True])), - ("last", Series([True, True, False, False, False])), - (False, Series([True, True, True, False, True])), - ], -) -def test_duplicated_nan_none(keep, expected): - df = DataFrame({"C": [np.nan, 3, 3, None, np.nan]}, dtype=object) - - result = df.duplicated(keep=keep) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("keep", ["first", "last", False]) -@pytest.mark.parametrize("subset", [None, ["A", "B"], "A"]) -def test_duplicated_subset(subset, keep): - df = DataFrame( - { - "A": [0, 1, 1, 2, 0], - "B": ["a", "b", "b", "c", "a"], - "C": [np.nan, 3, 3, None, np.nan], - } - ) - - if subset is None: - subset = list(df.columns) - elif isinstance(subset, str): - # need to have a DataFrame, not a Series - # -> select columns with singleton list, not string - subset = [subset] - - expected = df[subset].duplicated(keep=keep) - result = df.duplicated(keep=keep, subset=subset) - tm.assert_series_equal(result, expected) - - def test_drop_duplicates(): df = DataFrame( { @@ -188,17 +113,6 @@ def test_drop_duplicates(): assert df.duplicated(keep=keep).sum() == 0 -def test_duplicated_on_empty_frame(): - # GH 25184 - - df = DataFrame(columns=["a", "b"]) - dupes = df.duplicated("a") - - result = df[dupes] - expected = df.copy() - tm.assert_frame_equal(result, expected) - - def test_drop_duplicates_with_duplicate_column_names(): # GH17836 df = DataFrame([[1, 2, 5], [3, 4, 6], [3, 4, 7]], columns=["a", "a", "b"]) diff --git a/pandas/tests/frame/methods/test_duplicated.py b/pandas/tests/frame/methods/test_duplicated.py new file mode 100644 index 0000000000000..d5c28a416ffa7 --- /dev/null +++ b/pandas/tests/frame/methods/test_duplicated.py @@ -0,0 +1,100 @@ +import re + +import numpy as np +import pytest + +from pandas import DataFrame, Series +import pandas.util.testing as tm + + +@pytest.mark.parametrize("subset", ["a", ["a"], ["a", "B"]]) +def test_duplicated_with_misspelled_column_name(subset): + # GH 19730 + df = DataFrame({"A": [0, 0, 1], "B": [0, 0, 1], "C": [0, 0, 1]}) + msg = re.escape("Index(['a'], dtype='object')") + + with pytest.raises(KeyError, match=msg): + df.duplicated(subset) + + +@pytest.mark.slow +def test_duplicated_do_not_fail_on_wide_dataframes(): + # gh-21524 + # Given the wide dataframe with a lot of columns + # with different (important!) values + data = { + "col_{0:02d}".format(i): np.random.randint(0, 1000, 30000) for i in range(100) + } + df = DataFrame(data).T + result = df.duplicated() + + # Then duplicates produce the bool Series as a result and don't fail during + # calculation. Actual values doesn't matter here, though usually it's all + # False in this case + assert isinstance(result, Series) + assert result.dtype == np.bool + + +@pytest.mark.parametrize( + "keep, expected", + [ + ("first", Series([False, False, True, False, True])), + ("last", Series([True, True, False, False, False])), + (False, Series([True, True, True, False, True])), + ], +) +def test_duplicated_keep(keep, expected): + df = DataFrame({"A": [0, 1, 1, 2, 0], "B": ["a", "b", "b", "c", "a"]}) + + result = df.duplicated(keep=keep) + tm.assert_series_equal(result, expected) + + +@pytest.mark.xfail(reason="GH#21720; nan/None falsely considered equal") +@pytest.mark.parametrize( + "keep, expected", + [ + ("first", Series([False, False, True, False, True])), + ("last", Series([True, True, False, False, False])), + (False, Series([True, True, True, False, True])), + ], +) +def test_duplicated_nan_none(keep, expected): + df = DataFrame({"C": [np.nan, 3, 3, None, np.nan]}, dtype=object) + + result = df.duplicated(keep=keep) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("keep", ["first", "last", False]) +@pytest.mark.parametrize("subset", [None, ["A", "B"], "A"]) +def test_duplicated_subset(subset, keep): + df = DataFrame( + { + "A": [0, 1, 1, 2, 0], + "B": ["a", "b", "b", "c", "a"], + "C": [np.nan, 3, 3, None, np.nan], + } + ) + + if subset is None: + subset = list(df.columns) + elif isinstance(subset, str): + # need to have a DataFrame, not a Series + # -> select columns with singleton list, not string + subset = [subset] + + expected = df[subset].duplicated(keep=keep) + result = df.duplicated(keep=keep, subset=subset) + tm.assert_series_equal(result, expected) + + +def test_duplicated_on_empty_frame(): + # GH 25184 + + df = DataFrame(columns=["a", "b"]) + dupes = df.duplicated("a") + + result = df[dupes] + expected = df.copy() + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_pct_change.py b/pandas/tests/frame/methods/test_pct_change.py new file mode 100644 index 0000000000000..0c15533c37f01 --- /dev/null +++ b/pandas/tests/frame/methods/test_pct_change.py @@ -0,0 +1,78 @@ +import numpy as np +import pytest + +from pandas import DataFrame, Series +import pandas.util.testing as tm + + +class TestDataFramePctChange: + def test_pct_change_numeric(self): + # GH#11150 + pnl = DataFrame( + [np.arange(0, 40, 10), np.arange(0, 40, 10), np.arange(0, 40, 10)] + ).astype(np.float64) + pnl.iat[1, 0] = np.nan + pnl.iat[1, 1] = np.nan + pnl.iat[2, 3] = 60 + + for axis in range(2): + expected = pnl.ffill(axis=axis) / pnl.ffill(axis=axis).shift(axis=axis) - 1 + result = pnl.pct_change(axis=axis, fill_method="pad") + + tm.assert_frame_equal(result, expected) + + def test_pct_change(self, datetime_frame): + rs = datetime_frame.pct_change(fill_method=None) + tm.assert_frame_equal(rs, datetime_frame / datetime_frame.shift(1) - 1) + + rs = datetime_frame.pct_change(2) + filled = datetime_frame.fillna(method="pad") + tm.assert_frame_equal(rs, filled / filled.shift(2) - 1) + + rs = datetime_frame.pct_change(fill_method="bfill", limit=1) + filled = datetime_frame.fillna(method="bfill", limit=1) + tm.assert_frame_equal(rs, filled / filled.shift(1) - 1) + + rs = datetime_frame.pct_change(freq="5D") + filled = datetime_frame.fillna(method="pad") + tm.assert_frame_equal( + rs, (filled / filled.shift(freq="5D") - 1).reindex_like(filled) + ) + + def test_pct_change_shift_over_nas(self): + s = Series([1.0, 1.5, np.nan, 2.5, 3.0]) + + df = DataFrame({"a": s, "b": s}) + + chg = df.pct_change() + expected = Series([np.nan, 0.5, 0.0, 2.5 / 1.5 - 1, 0.2]) + edf = DataFrame({"a": expected, "b": expected}) + tm.assert_frame_equal(chg, edf) + + @pytest.mark.parametrize( + "freq, periods, fill_method, limit", + [ + ("5B", 5, None, None), + ("3B", 3, None, None), + ("3B", 3, "bfill", None), + ("7B", 7, "pad", 1), + ("7B", 7, "bfill", 3), + ("14B", 14, None, None), + ], + ) + def test_pct_change_periods_freq( + self, datetime_frame, freq, periods, fill_method, limit + ): + # GH#7292 + rs_freq = datetime_frame.pct_change( + freq=freq, fill_method=fill_method, limit=limit + ) + rs_periods = datetime_frame.pct_change( + periods, fill_method=fill_method, limit=limit + ) + tm.assert_frame_equal(rs_freq, rs_periods) + + empty_ts = DataFrame(index=datetime_frame.index, columns=datetime_frame.columns) + rs_freq = empty_ts.pct_change(freq=freq, fill_method=fill_method, limit=limit) + rs_periods = empty_ts.pct_change(periods, fill_method=fill_method, limit=limit) + tm.assert_frame_equal(rs_freq, rs_periods) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 9ddb14470f6e4..a705fc89a813d 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -893,24 +893,6 @@ def test_sum_bools(self): bools = isna(df) assert bools.sum(axis=1)[0] == 10 - # --------------------------------------------------------------------- - # Miscellanea - - def test_pct_change(self): - # GH#11150 - pnl = DataFrame( - [np.arange(0, 40, 10), np.arange(0, 40, 10), np.arange(0, 40, 10)] - ).astype(np.float64) - pnl.iat[1, 0] = np.nan - pnl.iat[1, 1] = np.nan - pnl.iat[2, 3] = 60 - - for axis in range(2): - expected = pnl.ffill(axis=axis) / pnl.ffill(axis=axis).shift(axis=axis) - 1 - result = pnl.pct_change(axis=axis, fill_method="pad") - - tm.assert_frame_equal(result, expected) - # ---------------------------------------------------------------------- # Index of max / min diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py index 318b1c6add91e..60dce36312145 100644 --- a/pandas/tests/frame/test_repr_info.py +++ b/pandas/tests/frame/test_repr_info.py @@ -3,6 +3,7 @@ import re import sys import textwrap +import warnings import numpy as np import pytest @@ -29,17 +30,17 @@ class TestDataFrameReprInfoEtc: def test_repr_empty(self): # empty - foo = repr(DataFrame()) # noqa + repr(DataFrame()) # empty with index frame = DataFrame(index=np.arange(1000)) - foo = repr(frame) # noqa + repr(frame) def test_repr_mixed(self, float_string_frame): buf = StringIO() # mixed - foo = repr(float_string_frame) # noqa + repr(float_string_frame) float_string_frame.info(verbose=False, buf=buf) @pytest.mark.slow @@ -51,13 +52,13 @@ def test_repr_mixed_big(self): biggie.loc[:20, "A"] = np.nan biggie.loc[:20, "B"] = np.nan - foo = repr(biggie) # noqa + repr(biggie) def test_repr(self, float_frame): buf = StringIO() # small one - foo = repr(float_frame) + repr(float_frame) float_frame.info(verbose=False, buf=buf) # even smaller @@ -68,7 +69,7 @@ def test_repr(self, float_frame): # columns but no index no_index = DataFrame(columns=[0, 1, 3]) - foo = repr(no_index) # noqa + repr(no_index) # no columns or index DataFrame().info(buf=buf) @@ -97,7 +98,6 @@ def test_repr_big(self): def test_repr_unsortable(self, float_frame): # columns are not sortable - import warnings warn_filters = warnings.filters warnings.filterwarnings("ignore", category=FutureWarning, module=".*format") diff --git a/pandas/tests/frame/test_timeseries.py b/pandas/tests/frame/test_timeseries.py index b9df3ce305dbc..9985468ac6cd8 100644 --- a/pandas/tests/frame/test_timeseries.py +++ b/pandas/tests/frame/test_timeseries.py @@ -27,62 +27,6 @@ def close_open_fixture(request): class TestDataFrameTimeSeriesMethods: - def test_pct_change(self, datetime_frame): - rs = datetime_frame.pct_change(fill_method=None) - tm.assert_frame_equal(rs, datetime_frame / datetime_frame.shift(1) - 1) - - rs = datetime_frame.pct_change(2) - filled = datetime_frame.fillna(method="pad") - tm.assert_frame_equal(rs, filled / filled.shift(2) - 1) - - rs = datetime_frame.pct_change(fill_method="bfill", limit=1) - filled = datetime_frame.fillna(method="bfill", limit=1) - tm.assert_frame_equal(rs, filled / filled.shift(1) - 1) - - rs = datetime_frame.pct_change(freq="5D") - filled = datetime_frame.fillna(method="pad") - tm.assert_frame_equal( - rs, (filled / filled.shift(freq="5D") - 1).reindex_like(filled) - ) - - def test_pct_change_shift_over_nas(self): - s = Series([1.0, 1.5, np.nan, 2.5, 3.0]) - - df = DataFrame({"a": s, "b": s}) - - chg = df.pct_change() - expected = Series([np.nan, 0.5, 0.0, 2.5 / 1.5 - 1, 0.2]) - edf = DataFrame({"a": expected, "b": expected}) - tm.assert_frame_equal(chg, edf) - - @pytest.mark.parametrize( - "freq, periods, fill_method, limit", - [ - ("5B", 5, None, None), - ("3B", 3, None, None), - ("3B", 3, "bfill", None), - ("7B", 7, "pad", 1), - ("7B", 7, "bfill", 3), - ("14B", 14, None, None), - ], - ) - def test_pct_change_periods_freq( - self, datetime_frame, freq, periods, fill_method, limit - ): - # GH 7292 - rs_freq = datetime_frame.pct_change( - freq=freq, fill_method=fill_method, limit=limit - ) - rs_periods = datetime_frame.pct_change( - periods, fill_method=fill_method, limit=limit - ) - tm.assert_frame_equal(rs_freq, rs_periods) - - empty_ts = DataFrame(index=datetime_frame.index, columns=datetime_frame.columns) - rs_freq = empty_ts.pct_change(freq=freq, fill_method=fill_method, limit=limit) - rs_periods = empty_ts.pct_change(periods, fill_method=fill_method, limit=limit) - tm.assert_frame_equal(rs_freq, rs_periods) - def test_frame_ctor_datetime64_column(self): rng = date_range("1/1/2000 00:00:00", "1/1/2000 1:59:50", freq="10s") dates = np.asarray(rng) diff --git a/pandas/tests/series/methods/test_argsort.py b/pandas/tests/series/methods/test_argsort.py new file mode 100644 index 0000000000000..9dd3045ad86d9 --- /dev/null +++ b/pandas/tests/series/methods/test_argsort.py @@ -0,0 +1,63 @@ +import numpy as np +import pytest + +from pandas import Series, Timestamp, isna +import pandas.util.testing as tm + + +class TestSeriesArgsort: + def _check_accum_op(self, name, ser, check_dtype=True): + func = getattr(np, name) + tm.assert_numpy_array_equal( + func(ser).values, func(np.array(ser)), check_dtype=check_dtype, + ) + + # with missing values + ts = ser.copy() + ts[::2] = np.NaN + + result = func(ts)[1::2] + expected = func(np.array(ts.dropna())) + + tm.assert_numpy_array_equal(result.values, expected, check_dtype=False) + + def test_argsort(self, datetime_series): + self._check_accum_op("argsort", datetime_series, check_dtype=False) + argsorted = datetime_series.argsort() + assert issubclass(argsorted.dtype.type, np.integer) + + # GH#2967 (introduced bug in 0.11-dev I think) + s = Series([Timestamp("201301{i:02d}".format(i=i)) for i in range(1, 6)]) + assert s.dtype == "datetime64[ns]" + shifted = s.shift(-1) + assert shifted.dtype == "datetime64[ns]" + assert isna(shifted[4]) + + result = s.argsort() + expected = Series(range(5), dtype="int64") + tm.assert_series_equal(result, expected) + + result = shifted.argsort() + expected = Series(list(range(4)) + [-1], dtype="int64") + tm.assert_series_equal(result, expected) + + def test_argsort_stable(self): + s = Series(np.random.randint(0, 100, size=10000)) + mindexer = s.argsort(kind="mergesort") + qindexer = s.argsort() + + mexpected = np.argsort(s.values, kind="mergesort") + qexpected = np.argsort(s.values, kind="quicksort") + + tm.assert_series_equal(mindexer, Series(mexpected), check_dtype=False) + tm.assert_series_equal(qindexer, Series(qexpected), check_dtype=False) + msg = ( + r"ndarray Expected type ," + r" found instead" + ) + with pytest.raises(AssertionError, match=msg): + tm.assert_numpy_array_equal(qindexer, mindexer) + + def test_argsort_preserve_name(self, datetime_series): + result = datetime_series.argsort() + assert result.name == datetime_series.name diff --git a/pandas/tests/series/methods/test_drop_duplicates.py b/pandas/tests/series/methods/test_drop_duplicates.py new file mode 100644 index 0000000000000..2c5dcd2c45171 --- /dev/null +++ b/pandas/tests/series/methods/test_drop_duplicates.py @@ -0,0 +1,141 @@ +import numpy as np +import pytest + +from pandas import Categorical, Series +import pandas.util.testing as tm + + +@pytest.mark.parametrize( + "keep, expected", + [ + ("first", Series([False, False, False, False, True, True, False])), + ("last", Series([False, True, True, False, False, False, False])), + (False, Series([False, True, True, False, True, True, False])), + ], +) +def test_drop_duplicates(any_numpy_dtype, keep, expected): + tc = Series([1, 0, 3, 5, 3, 0, 4], dtype=np.dtype(any_numpy_dtype)) + + if tc.dtype == "bool": + pytest.skip("tested separately in test_drop_duplicates_bool") + + tm.assert_series_equal(tc.duplicated(keep=keep), expected) + tm.assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected]) + sc = tc.copy() + sc.drop_duplicates(keep=keep, inplace=True) + tm.assert_series_equal(sc, tc[~expected]) + + +@pytest.mark.parametrize( + "keep, expected", + [ + ("first", Series([False, False, True, True])), + ("last", Series([True, True, False, False])), + (False, Series([True, True, True, True])), + ], +) +def test_drop_duplicates_bool(keep, expected): + tc = Series([True, False, True, False]) + + tm.assert_series_equal(tc.duplicated(keep=keep), expected) + tm.assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected]) + sc = tc.copy() + sc.drop_duplicates(keep=keep, inplace=True) + tm.assert_series_equal(sc, tc[~expected]) + + +class TestSeriesDropDuplicates: + @pytest.mark.parametrize( + "dtype", + ["int_", "uint", "float_", "unicode_", "timedelta64[h]", "datetime64[D]"], + ) + def test_drop_duplicates_categorical_non_bool(self, dtype, ordered_fixture): + cat_array = np.array([1, 2, 3, 4, 5], dtype=np.dtype(dtype)) + + # Test case 1 + input1 = np.array([1, 2, 3, 3], dtype=np.dtype(dtype)) + tc1 = Series(Categorical(input1, categories=cat_array, ordered=ordered_fixture)) + if dtype == "datetime64[D]": + # pre-empty flaky xfail, tc1 values are seemingly-random + if not (np.array(tc1) == input1).all(): + pytest.xfail(reason="GH#7996") + + expected = Series([False, False, False, True]) + tm.assert_series_equal(tc1.duplicated(), expected) + tm.assert_series_equal(tc1.drop_duplicates(), tc1[~expected]) + sc = tc1.copy() + sc.drop_duplicates(inplace=True) + tm.assert_series_equal(sc, tc1[~expected]) + + expected = Series([False, False, True, False]) + tm.assert_series_equal(tc1.duplicated(keep="last"), expected) + tm.assert_series_equal(tc1.drop_duplicates(keep="last"), tc1[~expected]) + sc = tc1.copy() + sc.drop_duplicates(keep="last", inplace=True) + tm.assert_series_equal(sc, tc1[~expected]) + + expected = Series([False, False, True, True]) + tm.assert_series_equal(tc1.duplicated(keep=False), expected) + tm.assert_series_equal(tc1.drop_duplicates(keep=False), tc1[~expected]) + sc = tc1.copy() + sc.drop_duplicates(keep=False, inplace=True) + tm.assert_series_equal(sc, tc1[~expected]) + + # Test case 2 + input2 = np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(dtype)) + tc2 = Series(Categorical(input2, categories=cat_array, ordered=ordered_fixture)) + if dtype == "datetime64[D]": + # pre-empty flaky xfail, tc2 values are seemingly-random + if not (np.array(tc2) == input2).all(): + pytest.xfail(reason="GH#7996") + + expected = Series([False, False, False, False, True, True, False]) + tm.assert_series_equal(tc2.duplicated(), expected) + tm.assert_series_equal(tc2.drop_duplicates(), tc2[~expected]) + sc = tc2.copy() + sc.drop_duplicates(inplace=True) + tm.assert_series_equal(sc, tc2[~expected]) + + expected = Series([False, True, True, False, False, False, False]) + tm.assert_series_equal(tc2.duplicated(keep="last"), expected) + tm.assert_series_equal(tc2.drop_duplicates(keep="last"), tc2[~expected]) + sc = tc2.copy() + sc.drop_duplicates(keep="last", inplace=True) + tm.assert_series_equal(sc, tc2[~expected]) + + expected = Series([False, True, True, False, True, True, False]) + tm.assert_series_equal(tc2.duplicated(keep=False), expected) + tm.assert_series_equal(tc2.drop_duplicates(keep=False), tc2[~expected]) + sc = tc2.copy() + sc.drop_duplicates(keep=False, inplace=True) + tm.assert_series_equal(sc, tc2[~expected]) + + def test_drop_duplicates_categorical_bool(self, ordered_fixture): + tc = Series( + Categorical( + [True, False, True, False], + categories=[True, False], + ordered=ordered_fixture, + ) + ) + + expected = Series([False, False, True, True]) + tm.assert_series_equal(tc.duplicated(), expected) + tm.assert_series_equal(tc.drop_duplicates(), tc[~expected]) + sc = tc.copy() + sc.drop_duplicates(inplace=True) + tm.assert_series_equal(sc, tc[~expected]) + + expected = Series([True, True, False, False]) + tm.assert_series_equal(tc.duplicated(keep="last"), expected) + tm.assert_series_equal(tc.drop_duplicates(keep="last"), tc[~expected]) + sc = tc.copy() + sc.drop_duplicates(keep="last", inplace=True) + tm.assert_series_equal(sc, tc[~expected]) + + expected = Series([True, True, True, True]) + tm.assert_series_equal(tc.duplicated(keep=False), expected) + tm.assert_series_equal(tc.drop_duplicates(keep=False), tc[~expected]) + sc = tc.copy() + sc.drop_duplicates(keep=False, inplace=True) + tm.assert_series_equal(sc, tc[~expected]) diff --git a/pandas/tests/series/methods/test_duplicated.py b/pandas/tests/series/methods/test_duplicated.py new file mode 100644 index 0000000000000..36b3b559477a6 --- /dev/null +++ b/pandas/tests/series/methods/test_duplicated.py @@ -0,0 +1,35 @@ +import numpy as np +import pytest + +from pandas import Series +import pandas.util.testing as tm + + +@pytest.mark.parametrize( + "keep, expected", + [ + ("first", Series([False, False, True, False, True], name="name")), + ("last", Series([True, True, False, False, False], name="name")), + (False, Series([True, True, True, False, True], name="name")), + ], +) +def test_duplicated_keep(keep, expected): + ser = Series(["a", "b", "b", "c", "a"], name="name") + + result = ser.duplicated(keep=keep) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "keep, expected", + [ + ("first", Series([False, False, True, False, True])), + ("last", Series([True, True, False, False, False])), + (False, Series([True, True, True, False, True])), + ], +) +def test_duplicated_nan_none(keep, expected): + ser = Series([np.nan, 3, 3, None, np.nan], dtype=object) + + result = ser.duplicated(keep=keep) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_pct_change.py b/pandas/tests/series/methods/test_pct_change.py new file mode 100644 index 0000000000000..abc5c498813ef --- /dev/null +++ b/pandas/tests/series/methods/test_pct_change.py @@ -0,0 +1,70 @@ +import numpy as np +import pytest + +from pandas import Series, date_range +import pandas.util.testing as tm + + +class TestSeriesPctChange: + def test_pct_change(self, datetime_series): + rs = datetime_series.pct_change(fill_method=None) + tm.assert_series_equal(rs, datetime_series / datetime_series.shift(1) - 1) + + rs = datetime_series.pct_change(2) + filled = datetime_series.fillna(method="pad") + tm.assert_series_equal(rs, filled / filled.shift(2) - 1) + + rs = datetime_series.pct_change(fill_method="bfill", limit=1) + filled = datetime_series.fillna(method="bfill", limit=1) + tm.assert_series_equal(rs, filled / filled.shift(1) - 1) + + rs = datetime_series.pct_change(freq="5D") + filled = datetime_series.fillna(method="pad") + tm.assert_series_equal( + rs, (filled / filled.shift(freq="5D") - 1).reindex_like(filled) + ) + + def test_pct_change_with_duplicate_axis(self): + # GH#28664 + common_idx = date_range("2019-11-14", periods=5, freq="D") + result = Series(range(5), common_idx).pct_change(freq="B") + + # the reason that the expected should be like this is documented at PR 28681 + expected = Series([np.NaN, np.inf, np.NaN, np.NaN, 3.0], common_idx) + + tm.assert_series_equal(result, expected) + + def test_pct_change_shift_over_nas(self): + s = Series([1.0, 1.5, np.nan, 2.5, 3.0]) + + chg = s.pct_change() + expected = Series([np.nan, 0.5, 0.0, 2.5 / 1.5 - 1, 0.2]) + tm.assert_series_equal(chg, expected) + + @pytest.mark.parametrize( + "freq, periods, fill_method, limit", + [ + ("5B", 5, None, None), + ("3B", 3, None, None), + ("3B", 3, "bfill", None), + ("7B", 7, "pad", 1), + ("7B", 7, "bfill", 3), + ("14B", 14, None, None), + ], + ) + def test_pct_change_periods_freq( + self, freq, periods, fill_method, limit, datetime_series + ): + # GH#7292 + rs_freq = datetime_series.pct_change( + freq=freq, fill_method=fill_method, limit=limit + ) + rs_periods = datetime_series.pct_change( + periods, fill_method=fill_method, limit=limit + ) + tm.assert_series_equal(rs_freq, rs_periods) + + empty_ts = Series(index=datetime_series.index, dtype=object) + rs_freq = empty_ts.pct_change(freq=freq, fill_method=fill_method, limit=limit) + rs_periods = empty_ts.pct_change(periods, fill_method=fill_method, limit=limit) + tm.assert_series_equal(rs_freq, rs_periods) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 17cf307a04d7f..16d058d88b4ad 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -6,65 +6,11 @@ import pandas.util._test_decorators as td import pandas as pd -from pandas import Categorical, DataFrame, MultiIndex, Series, Timestamp, isna +from pandas import DataFrame, MultiIndex, Series import pandas.util.testing as tm class TestSeriesAnalytics: - def test_argsort(self, datetime_series): - self._check_accum_op("argsort", datetime_series, check_dtype=False) - argsorted = datetime_series.argsort() - assert issubclass(argsorted.dtype.type, np.integer) - - # GH 2967 (introduced bug in 0.11-dev I think) - s = Series([Timestamp("201301{i:02d}".format(i=i)) for i in range(1, 6)]) - assert s.dtype == "datetime64[ns]" - shifted = s.shift(-1) - assert shifted.dtype == "datetime64[ns]" - assert isna(shifted[4]) - - result = s.argsort() - expected = Series(range(5), dtype="int64") - tm.assert_series_equal(result, expected) - - result = shifted.argsort() - expected = Series(list(range(4)) + [-1], dtype="int64") - tm.assert_series_equal(result, expected) - - def test_argsort_stable(self): - s = Series(np.random.randint(0, 100, size=10000)) - mindexer = s.argsort(kind="mergesort") - qindexer = s.argsort() - - mexpected = np.argsort(s.values, kind="mergesort") - qexpected = np.argsort(s.values, kind="quicksort") - - tm.assert_series_equal(mindexer, Series(mexpected), check_dtype=False) - tm.assert_series_equal(qindexer, Series(qexpected), check_dtype=False) - msg = ( - r"ndarray Expected type ," - r" found instead" - ) - with pytest.raises(AssertionError, match=msg): - tm.assert_numpy_array_equal(qindexer, mindexer) - - def _check_accum_op(self, name, datetime_series_, check_dtype=True): - func = getattr(np, name) - tm.assert_numpy_array_equal( - func(datetime_series_).values, - func(np.array(datetime_series_)), - check_dtype=check_dtype, - ) - - # with missing values - ts = datetime_series_.copy() - ts[::2] = np.NaN - - result = func(ts)[1::2] - expected = func(np.array(ts.dropna())) - - tm.assert_numpy_array_equal(result.values, expected, check_dtype=False) - def test_compress(self): cond = [True, False, True, False, False] s = Series([1, -1, 5, 8, 7], index=list("abcde"), name="foo") @@ -397,100 +343,3 @@ def test_validate_stat_keepdims(self): ) with pytest.raises(ValueError, match=msg): np.sum(s, keepdims=True) - - -class TestCategoricalSeriesAnalytics: - @pytest.mark.parametrize( - "dtype", - ["int_", "uint", "float_", "unicode_", "timedelta64[h]", "datetime64[D]"], - ) - def test_drop_duplicates_categorical_non_bool(self, dtype, ordered_fixture): - cat_array = np.array([1, 2, 3, 4, 5], dtype=np.dtype(dtype)) - - # Test case 1 - input1 = np.array([1, 2, 3, 3], dtype=np.dtype(dtype)) - tc1 = Series(Categorical(input1, categories=cat_array, ordered=ordered_fixture)) - if dtype == "datetime64[D]": - # pre-empty flaky xfail, tc1 values are seemingly-random - if not (np.array(tc1) == input1).all(): - pytest.xfail(reason="GH#7996") - - expected = Series([False, False, False, True]) - tm.assert_series_equal(tc1.duplicated(), expected) - tm.assert_series_equal(tc1.drop_duplicates(), tc1[~expected]) - sc = tc1.copy() - sc.drop_duplicates(inplace=True) - tm.assert_series_equal(sc, tc1[~expected]) - - expected = Series([False, False, True, False]) - tm.assert_series_equal(tc1.duplicated(keep="last"), expected) - tm.assert_series_equal(tc1.drop_duplicates(keep="last"), tc1[~expected]) - sc = tc1.copy() - sc.drop_duplicates(keep="last", inplace=True) - tm.assert_series_equal(sc, tc1[~expected]) - - expected = Series([False, False, True, True]) - tm.assert_series_equal(tc1.duplicated(keep=False), expected) - tm.assert_series_equal(tc1.drop_duplicates(keep=False), tc1[~expected]) - sc = tc1.copy() - sc.drop_duplicates(keep=False, inplace=True) - tm.assert_series_equal(sc, tc1[~expected]) - - # Test case 2 - input2 = np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(dtype)) - tc2 = Series(Categorical(input2, categories=cat_array, ordered=ordered_fixture)) - if dtype == "datetime64[D]": - # pre-empty flaky xfail, tc2 values are seemingly-random - if not (np.array(tc2) == input2).all(): - pytest.xfail(reason="GH#7996") - - expected = Series([False, False, False, False, True, True, False]) - tm.assert_series_equal(tc2.duplicated(), expected) - tm.assert_series_equal(tc2.drop_duplicates(), tc2[~expected]) - sc = tc2.copy() - sc.drop_duplicates(inplace=True) - tm.assert_series_equal(sc, tc2[~expected]) - - expected = Series([False, True, True, False, False, False, False]) - tm.assert_series_equal(tc2.duplicated(keep="last"), expected) - tm.assert_series_equal(tc2.drop_duplicates(keep="last"), tc2[~expected]) - sc = tc2.copy() - sc.drop_duplicates(keep="last", inplace=True) - tm.assert_series_equal(sc, tc2[~expected]) - - expected = Series([False, True, True, False, True, True, False]) - tm.assert_series_equal(tc2.duplicated(keep=False), expected) - tm.assert_series_equal(tc2.drop_duplicates(keep=False), tc2[~expected]) - sc = tc2.copy() - sc.drop_duplicates(keep=False, inplace=True) - tm.assert_series_equal(sc, tc2[~expected]) - - def test_drop_duplicates_categorical_bool(self, ordered_fixture): - tc = Series( - Categorical( - [True, False, True, False], - categories=[True, False], - ordered=ordered_fixture, - ) - ) - - expected = Series([False, False, True, True]) - tm.assert_series_equal(tc.duplicated(), expected) - tm.assert_series_equal(tc.drop_duplicates(), tc[~expected]) - sc = tc.copy() - sc.drop_duplicates(inplace=True) - tm.assert_series_equal(sc, tc[~expected]) - - expected = Series([True, True, False, False]) - tm.assert_series_equal(tc.duplicated(keep="last"), expected) - tm.assert_series_equal(tc.drop_duplicates(keep="last"), tc[~expected]) - sc = tc.copy() - sc.drop_duplicates(keep="last", inplace=True) - tm.assert_series_equal(sc, tc[~expected]) - - expected = Series([True, True, True, True]) - tm.assert_series_equal(tc.duplicated(keep=False), expected) - tm.assert_series_equal(tc.drop_duplicates(keep=False), tc[~expected]) - sc = tc.copy() - sc.drop_duplicates(keep=False, inplace=True) - tm.assert_series_equal(sc, tc[~expected]) diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index f8cf6b6a54d14..47ffb2259df6a 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -112,10 +112,6 @@ def _pickle_roundtrip(self, obj): unpickled = pd.read_pickle(path) return unpickled - def test_argsort_preserve_name(self, datetime_series): - result = datetime_series.argsort() - assert result.name == datetime_series.name - def test_sort_index_name(self, datetime_series): result = datetime_series.sort_index(ascending=False) assert result.name == datetime_series.name diff --git a/pandas/tests/series/test_duplicates.py b/pandas/tests/series/test_duplicates.py index 666354e70bdd4..57d919ccb89ec 100644 --- a/pandas/tests/series/test_duplicates.py +++ b/pandas/tests/series/test_duplicates.py @@ -6,7 +6,7 @@ import pandas.util.testing as tm -def test_value_counts_nunique(): +def test_nunique(): # basics.rst doc example series = Series(np.random.randn(500)) series[20:500] = np.nan @@ -90,72 +90,3 @@ def __ne__(self, other): s.is_unique captured = capsys.readouterr() assert len(captured.err) == 0 - - -@pytest.mark.parametrize( - "keep, expected", - [ - ("first", Series([False, False, False, False, True, True, False])), - ("last", Series([False, True, True, False, False, False, False])), - (False, Series([False, True, True, False, True, True, False])), - ], -) -def test_drop_duplicates(any_numpy_dtype, keep, expected): - tc = Series([1, 0, 3, 5, 3, 0, 4], dtype=np.dtype(any_numpy_dtype)) - - if tc.dtype == "bool": - pytest.skip("tested separately in test_drop_duplicates_bool") - - tm.assert_series_equal(tc.duplicated(keep=keep), expected) - tm.assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected]) - sc = tc.copy() - sc.drop_duplicates(keep=keep, inplace=True) - tm.assert_series_equal(sc, tc[~expected]) - - -@pytest.mark.parametrize( - "keep, expected", - [ - ("first", Series([False, False, True, True])), - ("last", Series([True, True, False, False])), - (False, Series([True, True, True, True])), - ], -) -def test_drop_duplicates_bool(keep, expected): - tc = Series([True, False, True, False]) - - tm.assert_series_equal(tc.duplicated(keep=keep), expected) - tm.assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected]) - sc = tc.copy() - sc.drop_duplicates(keep=keep, inplace=True) - tm.assert_series_equal(sc, tc[~expected]) - - -@pytest.mark.parametrize( - "keep, expected", - [ - ("first", Series([False, False, True, False, True], name="name")), - ("last", Series([True, True, False, False, False], name="name")), - (False, Series([True, True, True, False, True], name="name")), - ], -) -def test_duplicated_keep(keep, expected): - s = Series(["a", "b", "b", "c", "a"], name="name") - - result = s.duplicated(keep=keep) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize( - "keep, expected", - [ - ("first", Series([False, False, True, False, True])), - ("last", Series([True, True, False, False, False])), - (False, Series([True, True, True, False, True])), - ], -) -def test_duplicated_nan_none(keep, expected): - s = Series([np.nan, 3, 3, None, np.nan], dtype=object) - - result = s.duplicated(keep=keep) - tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py index b9bd7744d3f9c..c3e5e8b975cda 100644 --- a/pandas/tests/series/test_timeseries.py +++ b/pandas/tests/series/test_timeseries.py @@ -76,69 +76,6 @@ def test_asfreq_datetimeindex_empty_series(self): result = Series([3], index=index.copy()).asfreq("H") tm.assert_index_equal(expected.index, result.index) - def test_pct_change(self, datetime_series): - rs = datetime_series.pct_change(fill_method=None) - tm.assert_series_equal(rs, datetime_series / datetime_series.shift(1) - 1) - - rs = datetime_series.pct_change(2) - filled = datetime_series.fillna(method="pad") - tm.assert_series_equal(rs, filled / filled.shift(2) - 1) - - rs = datetime_series.pct_change(fill_method="bfill", limit=1) - filled = datetime_series.fillna(method="bfill", limit=1) - tm.assert_series_equal(rs, filled / filled.shift(1) - 1) - - rs = datetime_series.pct_change(freq="5D") - filled = datetime_series.fillna(method="pad") - tm.assert_series_equal( - rs, (filled / filled.shift(freq="5D") - 1).reindex_like(filled) - ) - - def test_pct_change_with_duplicate_axis(self): - # GH 28664 - common_idx = date_range("2019-11-14", periods=5, freq="D") - result = Series(range(5), common_idx).pct_change(freq="B") - - # the reason that the expected should be like this is documented at PR 28681 - expected = Series([np.NaN, np.inf, np.NaN, np.NaN, 3.0], common_idx) - - tm.assert_series_equal(result, expected) - - def test_pct_change_shift_over_nas(self): - s = Series([1.0, 1.5, np.nan, 2.5, 3.0]) - - chg = s.pct_change() - expected = Series([np.nan, 0.5, 0.0, 2.5 / 1.5 - 1, 0.2]) - tm.assert_series_equal(chg, expected) - - @pytest.mark.parametrize( - "freq, periods, fill_method, limit", - [ - ("5B", 5, None, None), - ("3B", 3, None, None), - ("3B", 3, "bfill", None), - ("7B", 7, "pad", 1), - ("7B", 7, "bfill", 3), - ("14B", 14, None, None), - ], - ) - def test_pct_change_periods_freq( - self, freq, periods, fill_method, limit, datetime_series - ): - # GH 7292 - rs_freq = datetime_series.pct_change( - freq=freq, fill_method=fill_method, limit=limit - ) - rs_periods = datetime_series.pct_change( - periods, fill_method=fill_method, limit=limit - ) - tm.assert_series_equal(rs_freq, rs_periods) - - empty_ts = Series(index=datetime_series.index, dtype=object) - rs_freq = empty_ts.pct_change(freq=freq, fill_method=fill_method, limit=limit) - rs_periods = empty_ts.pct_change(periods, fill_method=fill_method, limit=limit) - tm.assert_series_equal(rs_freq, rs_periods) - def test_autocorr(self, datetime_series): # Just run the function corr1 = datetime_series.autocorr()