diff --git a/pandas/tests/frame/methods/test_nlargest.py b/pandas/tests/frame/methods/test_nlargest.py
new file mode 100644
index 0000000000000..72299ad6b2bf6
--- /dev/null
+++ b/pandas/tests/frame/methods/test_nlargest.py
@@ -0,0 +1,211 @@
+"""
+Note: for naming purposes, most tests are titled e.g. "test_nlargest_foo",
+but each implicitly also tests the corresponding nsmallest counterpart.
+"""
+from string import ascii_lowercase
+
+import numpy as np
+import pytest
+
+import pandas as pd
+import pandas.util.testing as tm
+
+
+@pytest.fixture
+def df_duplicates():
+    return pd.DataFrame(
+        {"a": [1, 2, 3, 4, 4], "b": [1, 1, 1, 1, 1], "c": [0, 1, 2, 5, 4]},
+        index=[0, 0, 1, 1, 1],
+    )
+
+
+@pytest.fixture
+def df_strings():
+    return pd.DataFrame(
+        {
+            "a": np.random.permutation(10),
+            "b": list(ascii_lowercase[:10]),
+            "c": np.random.permutation(10).astype("float64"),
+        }
+    )
+
+
+@pytest.fixture
+def df_main_dtypes():
+    return pd.DataFrame(
+        {
+            "group": [1, 1, 2],
+            "int": [1, 2, 3],
+            "float": [4.0, 5.0, 6.0],
+            "string": list("abc"),
+            "category_string": pd.Series(list("abc")).astype("category"),
+            "category_int": [7, 8, 9],
+            "datetime": pd.date_range("20130101", periods=3),
+            "datetimetz": pd.date_range("20130101", periods=3, tz="US/Eastern"),
+            "timedelta": pd.timedelta_range("1 s", periods=3, freq="s"),
+        },
+        columns=[
+            "group",
+            "int",
+            "float",
+            "string",
+            "category_string",
+            "category_int",
+            "datetime",
+            "datetimetz",
+            "timedelta",
+        ],
+    )
+
+
+class TestNLargestNSmallest:
+
+    # ----------------------------------------------------------------------
+    # Top / bottom
+    @pytest.mark.parametrize(
+        "order",
+        [
+            ["a"],
+            ["c"],
+            ["a", "b"],
+            ["a", "c"],
+            ["b", "a"],
+            ["b", "c"],
+            ["a", "b", "c"],
+            ["c", "a", "b"],
+            ["c", "b", "a"],
+            ["b", "c", "a"],
+            ["b", "a", "c"],
+            # dups!
+ ["b", "c", "c"], + ], + ) + @pytest.mark.parametrize("n", range(1, 11)) + def test_nlargest_n(self, df_strings, nselect_method, n, order): + # GH#10393 + df = df_strings + if "b" in order: + + error_msg = ( + f"Column 'b' has dtype object, " + f"cannot use method '{nselect_method}' with this dtype" + ) + with pytest.raises(TypeError, match=error_msg): + getattr(df, nselect_method)(n, order) + else: + ascending = nselect_method == "nsmallest" + result = getattr(df, nselect_method)(n, order) + expected = df.sort_values(order, ascending=ascending).head(n) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "columns", [["group", "category_string"], ["group", "string"]] + ) + def test_nlargest_error(self, df_main_dtypes, nselect_method, columns): + df = df_main_dtypes + col = columns[1] + error_msg = ( + f"Column '{col}' has dtype {df[col].dtype}, " + f"cannot use method '{nselect_method}' with this dtype" + ) + # escape some characters that may be in the repr + error_msg = ( + error_msg.replace("(", "\\(") + .replace(")", "\\)") + .replace("[", "\\[") + .replace("]", "\\]") + ) + with pytest.raises(TypeError, match=error_msg): + getattr(df, nselect_method)(2, columns) + + def test_nlargest_all_dtypes(self, df_main_dtypes): + df = df_main_dtypes + df.nsmallest(2, list(set(df) - {"category_string", "string"})) + df.nlargest(2, list(set(df) - {"category_string", "string"})) + + def test_nlargest_duplicates_on_starter_columns(self): + # regression test for GH#22752 + + df = pd.DataFrame({"a": [2, 2, 2, 1, 1, 1], "b": [1, 2, 3, 3, 2, 1]}) + + result = df.nlargest(4, columns=["a", "b"]) + expected = pd.DataFrame( + {"a": [2, 2, 2, 1], "b": [3, 2, 1, 3]}, index=[2, 1, 0, 3] + ) + tm.assert_frame_equal(result, expected) + + result = df.nsmallest(4, columns=["a", "b"]) + expected = pd.DataFrame( + {"a": [1, 1, 1, 2], "b": [1, 2, 3, 1]}, index=[5, 4, 3, 0] + ) + tm.assert_frame_equal(result, expected) + + def test_nlargest_n_identical_values(self): + # GH#15297 + df = pd.DataFrame({"a": [1] * 5, "b": [1, 2, 3, 4, 5]}) + + result = df.nlargest(3, "a") + expected = pd.DataFrame({"a": [1] * 3, "b": [1, 2, 3]}, index=[0, 1, 2]) + tm.assert_frame_equal(result, expected) + + result = df.nsmallest(3, "a") + expected = pd.DataFrame({"a": [1] * 3, "b": [1, 2, 3]}) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "order", + [["a", "b", "c"], ["c", "b", "a"], ["a"], ["b"], ["a", "b"], ["c", "b"]], + ) + @pytest.mark.parametrize("n", range(1, 6)) + def test_nlargest_n_duplicate_index(self, df_duplicates, n, order): + # GH#13412 + + df = df_duplicates + result = df.nsmallest(n, order) + expected = df.sort_values(order).head(n) + tm.assert_frame_equal(result, expected) + + result = df.nlargest(n, order) + expected = df.sort_values(order, ascending=False).head(n) + tm.assert_frame_equal(result, expected) + + def test_nlargest_duplicate_keep_all_ties(self): + # GH#16818 + df = pd.DataFrame( + {"a": [5, 4, 4, 2, 3, 3, 3, 3], "b": [10, 9, 8, 7, 5, 50, 10, 20]} + ) + result = df.nlargest(4, "a", keep="all") + expected = pd.DataFrame( + { + "a": {0: 5, 1: 4, 2: 4, 4: 3, 5: 3, 6: 3, 7: 3}, + "b": {0: 10, 1: 9, 2: 8, 4: 5, 5: 50, 6: 10, 7: 20}, + } + ) + tm.assert_frame_equal(result, expected) + + result = df.nsmallest(2, "a", keep="all") + expected = pd.DataFrame( + { + "a": {3: 2, 4: 3, 5: 3, 6: 3, 7: 3}, + "b": {3: 7, 4: 5, 5: 50, 6: 10, 7: 20}, + } + ) + tm.assert_frame_equal(result, expected) + + def test_nlargest_multiindex_column_lookup(self): + # Check whether tuples 
are correctly treated as multi-level lookups. + # GH#23033 + df = pd.DataFrame( + columns=pd.MultiIndex.from_product([["x"], ["a", "b"]]), + data=[[0.33, 0.13], [0.86, 0.25], [0.25, 0.70], [0.85, 0.91]], + ) + + # nsmallest + result = df.nsmallest(3, ("x", "a")) + expected = df.iloc[[2, 0, 3]] + tm.assert_frame_equal(result, expected) + + # nlargest + result = df.nlargest(3, ("x", "b")) + expected = df.iloc[[3, 2, 1]] + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 0653c9dc5f91b..ee9329da4e5e1 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1,7 +1,6 @@ from datetime import timedelta from decimal import Decimal import operator -from string import ascii_lowercase import warnings import numpy as np @@ -2442,194 +2441,16 @@ def test_matmul(self): with pytest.raises(ValueError, match="aligned"): operator.matmul(df, df2) + # --------------------------------------------------------------------- + # Unsorted -@pytest.fixture -def df_duplicates(): - return pd.DataFrame( - {"a": [1, 2, 3, 4, 4], "b": [1, 1, 1, 1, 1], "c": [0, 1, 2, 5, 4]}, - index=[0, 0, 1, 1, 1], - ) - - -@pytest.fixture -def df_strings(): - return pd.DataFrame( - { - "a": np.random.permutation(10), - "b": list(ascii_lowercase[:10]), - "c": np.random.permutation(10).astype("float64"), - } - ) - - -@pytest.fixture -def df_main_dtypes(): - return pd.DataFrame( - { - "group": [1, 1, 2], - "int": [1, 2, 3], - "float": [4.0, 5.0, 6.0], - "string": list("abc"), - "category_string": pd.Series(list("abc")).astype("category"), - "category_int": [7, 8, 9], - "datetime": pd.date_range("20130101", periods=3), - "datetimetz": pd.date_range("20130101", periods=3, tz="US/Eastern"), - "timedelta": pd.timedelta_range("1 s", periods=3, freq="s"), - }, - columns=[ - "group", - "int", - "float", - "string", - "category_string", - "category_int", - "datetime", - "datetimetz", - "timedelta", - ], - ) - - -class TestNLargestNSmallest: - - # ---------------------------------------------------------------------- - # Top / bottom - @pytest.mark.parametrize( - "order", - [ - ["a"], - ["c"], - ["a", "b"], - ["a", "c"], - ["b", "a"], - ["b", "c"], - ["a", "b", "c"], - ["c", "a", "b"], - ["c", "b", "a"], - ["b", "c", "a"], - ["b", "a", "c"], - # dups! 
- ["b", "c", "c"], - ], - ) - @pytest.mark.parametrize("n", range(1, 11)) - def test_n(self, df_strings, nselect_method, n, order): - # GH 10393 - df = df_strings - if "b" in order: - - error_msg = ( - f"Column 'b' has dtype object, " - f"cannot use method '{nselect_method}' with this dtype" - ) - with pytest.raises(TypeError, match=error_msg): - getattr(df, nselect_method)(n, order) - else: - ascending = nselect_method == "nsmallest" - result = getattr(df, nselect_method)(n, order) - expected = df.sort_values(order, ascending=ascending).head(n) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( - "columns", [["group", "category_string"], ["group", "string"]] - ) - def test_n_error(self, df_main_dtypes, nselect_method, columns): - df = df_main_dtypes - col = columns[1] - error_msg = ( - f"Column '{col}' has dtype {df[col].dtype}, " - f"cannot use method '{nselect_method}' with this dtype" - ) - # escape some characters that may be in the repr - error_msg = ( - error_msg.replace("(", "\\(") - .replace(")", "\\)") - .replace("[", "\\[") - .replace("]", "\\]") - ) - with pytest.raises(TypeError, match=error_msg): - getattr(df, nselect_method)(2, columns) - - def test_n_all_dtypes(self, df_main_dtypes): - df = df_main_dtypes - df.nsmallest(2, list(set(df) - {"category_string", "string"})) - df.nlargest(2, list(set(df) - {"category_string", "string"})) - - @pytest.mark.parametrize( - "method,expected", - [ - ( - "nlargest", - pd.DataFrame( - {"a": [2, 2, 2, 1], "b": [3, 2, 1, 3]}, index=[2, 1, 0, 3] - ), - ), - ( - "nsmallest", - pd.DataFrame( - {"a": [1, 1, 1, 2], "b": [1, 2, 3, 1]}, index=[5, 4, 3, 0] - ), - ), - ], - ) - def test_duplicates_on_starter_columns(self, method, expected): - # regression test for #22752 - - df = pd.DataFrame({"a": [2, 2, 2, 1, 1, 1], "b": [1, 2, 3, 3, 2, 1]}) - - result = getattr(df, method)(4, columns=["a", "b"]) - tm.assert_frame_equal(result, expected) - - def test_n_identical_values(self): - # GH 15297 - df = pd.DataFrame({"a": [1] * 5, "b": [1, 2, 3, 4, 5]}) - - result = df.nlargest(3, "a") - expected = pd.DataFrame({"a": [1] * 3, "b": [1, 2, 3]}, index=[0, 1, 2]) - tm.assert_frame_equal(result, expected) - - result = df.nsmallest(3, "a") - expected = pd.DataFrame({"a": [1] * 3, "b": [1, 2, 3]}) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( - "order", - [["a", "b", "c"], ["c", "b", "a"], ["a"], ["b"], ["a", "b"], ["c", "b"]], - ) - @pytest.mark.parametrize("n", range(1, 6)) - def test_n_duplicate_index(self, df_duplicates, n, order): - # GH 13412 - - df = df_duplicates - result = df.nsmallest(n, order) - expected = df.sort_values(order).head(n) - tm.assert_frame_equal(result, expected) - - result = df.nlargest(n, order) - expected = df.sort_values(order, ascending=False).head(n) - tm.assert_frame_equal(result, expected) - - def test_duplicate_keep_all_ties(self): - # GH 16818 - df = pd.DataFrame( - {"a": [5, 4, 4, 2, 3, 3, 3, 3], "b": [10, 9, 8, 7, 5, 50, 10, 20]} - ) - result = df.nlargest(4, "a", keep="all") - expected = pd.DataFrame( - { - "a": {0: 5, 1: 4, 2: 4, 4: 3, 5: 3, 6: 3, 7: 3}, - "b": {0: 10, 1: 9, 2: 8, 4: 5, 5: 50, 6: 10, 7: 20}, - } - ) - tm.assert_frame_equal(result, expected) - - result = df.nsmallest(2, "a", keep="all") - expected = pd.DataFrame( - { - "a": {3: 2, 4: 3, 5: 3, 6: 3, 7: 3}, - "b": {3: 7, 4: 5, 5: 50, 6: 10, 7: 20}, - } - ) + def test_series_nat_conversion(self): + # GH 18521 + # Check rank does not mutate DataFrame + df = DataFrame(np.random.randn(10, 3), dtype="float64") + 
expected = df.copy()
+        df.rank()
+        result = df
         tm.assert_frame_equal(result, expected)
 
     def test_series_broadcasting(self):
@@ -2644,30 +2465,3 @@ def test_series_broadcasting(self):
             df_nan.clip(lower=s, axis=0)
         for op in ["lt", "le", "gt", "ge", "eq", "ne"]:
             getattr(df, op)(s_nan, axis=0)
-
-    def test_series_nat_conversion(self):
-        # GH 18521
-        # Check rank does not mutate DataFrame
-        df = DataFrame(np.random.randn(10, 3), dtype="float64")
-        expected = df.copy()
-        df.rank()
-        result = df
-        tm.assert_frame_equal(result, expected)
-
-    def test_multiindex_column_lookup(self):
-        # Check whether tuples are correctly treated as multi-level lookups.
-        # GH 23033
-        df = pd.DataFrame(
-            columns=pd.MultiIndex.from_product([["x"], ["a", "b"]]),
-            data=[[0.33, 0.13], [0.86, 0.25], [0.25, 0.70], [0.85, 0.91]],
-        )
-
-        # nsmallest
-        result = df.nsmallest(3, ("x", "a"))
-        expected = df.iloc[[2, 0, 3]]
-        tm.assert_frame_equal(result, expected)
-
-        # nlargest
-        result = df.nlargest(3, ("x", "b"))
-        expected = df.iloc[[3, 2, 1]]
-        tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/series/methods/test_nlargest.py b/pandas/tests/series/methods/test_nlargest.py
new file mode 100644
index 0000000000000..423b4ad78a78a
--- /dev/null
+++ b/pandas/tests/series/methods/test_nlargest.py
@@ -0,0 +1,213 @@
+"""
+Note: for naming purposes, most tests are titled e.g. "test_nlargest_foo",
+but each implicitly also tests the corresponding nsmallest counterpart.
+"""
+from itertools import product
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import Series
+import pandas.util.testing as tm
+
+main_dtypes = [
+    "datetime",
+    "datetimetz",
+    "timedelta",
+    "int8",
+    "int16",
+    "int32",
+    "int64",
+    "float32",
+    "float64",
+    "uint8",
+    "uint16",
+    "uint32",
+    "uint64",
+]
+
+
+@pytest.fixture
+def s_main_dtypes():
+    """
+    A DataFrame with many dtypes
+
+    * datetime
+    * datetimetz
+    * timedelta
+    * [u]int{8,16,32,64}
+    * float{32,64}
+
+    The columns are the name of the dtype.
+    
+ """ + df = pd.DataFrame( + { + "datetime": pd.to_datetime(["2003", "2002", "2001", "2002", "2005"]), + "datetimetz": pd.to_datetime( + ["2003", "2002", "2001", "2002", "2005"] + ).tz_localize("US/Eastern"), + "timedelta": pd.to_timedelta(["3d", "2d", "1d", "2d", "5d"]), + } + ) + + for dtype in [ + "int8", + "int16", + "int32", + "int64", + "float32", + "float64", + "uint8", + "uint16", + "uint32", + "uint64", + ]: + df[dtype] = Series([3, 2, 1, 2, 5], dtype=dtype) + + return df + + +@pytest.fixture(params=main_dtypes) +def s_main_dtypes_split(request, s_main_dtypes): + """Each series in s_main_dtypes.""" + return s_main_dtypes[request.param] + + +def assert_check_nselect_boundary(vals, dtype, method): + # helper function for 'test_boundary_{dtype}' tests + ser = Series(vals, dtype=dtype) + result = getattr(ser, method)(3) + expected_idxr = [0, 1, 2] if method == "nsmallest" else [3, 2, 1] + expected = ser.loc[expected_idxr] + tm.assert_series_equal(result, expected) + + +class TestSeriesNLargestNSmallest: + @pytest.mark.parametrize( + "r", + [ + Series([3.0, 2, 1, 2, "5"], dtype="object"), + Series([3.0, 2, 1, 2, 5], dtype="object"), + # not supported on some archs + # Series([3., 2, 1, 2, 5], dtype='complex256'), + Series([3.0, 2, 1, 2, 5], dtype="complex128"), + Series(list("abcde")), + Series(list("abcde"), dtype="category"), + ], + ) + def test_nlargest_error(self, r): + dt = r.dtype + msg = "Cannot use method 'n(larg|small)est' with dtype {dt}".format(dt=dt) + args = 2, len(r), 0, -1 + methods = r.nlargest, r.nsmallest + for method, arg in product(methods, args): + with pytest.raises(TypeError, match=msg): + method(arg) + + def test_nsmallest_nlargest(self, s_main_dtypes_split): + # float, int, datetime64 (use i8), timedelts64 (same), + # object that are numbers, object that are strings + ser = s_main_dtypes_split + + tm.assert_series_equal(ser.nsmallest(2), ser.iloc[[2, 1]]) + tm.assert_series_equal(ser.nsmallest(2, keep="last"), ser.iloc[[2, 3]]) + + empty = ser.iloc[0:0] + tm.assert_series_equal(ser.nsmallest(0), empty) + tm.assert_series_equal(ser.nsmallest(-1), empty) + tm.assert_series_equal(ser.nlargest(0), empty) + tm.assert_series_equal(ser.nlargest(-1), empty) + + tm.assert_series_equal(ser.nsmallest(len(ser)), ser.sort_values()) + tm.assert_series_equal(ser.nsmallest(len(ser) + 1), ser.sort_values()) + tm.assert_series_equal(ser.nlargest(len(ser)), ser.iloc[[4, 0, 1, 3, 2]]) + tm.assert_series_equal(ser.nlargest(len(ser) + 1), ser.iloc[[4, 0, 1, 3, 2]]) + + def test_nlargest_misc(self): + + ser = Series([3.0, np.nan, 1, 2, 5]) + tm.assert_series_equal(ser.nlargest(), ser.iloc[[4, 0, 3, 2]]) + tm.assert_series_equal(ser.nsmallest(), ser.iloc[[2, 3, 0, 4]]) + + msg = 'keep must be either "first", "last"' + with pytest.raises(ValueError, match=msg): + ser.nsmallest(keep="invalid") + with pytest.raises(ValueError, match=msg): + ser.nlargest(keep="invalid") + + # GH#15297 + ser = Series([1] * 5, index=[1, 2, 3, 4, 5]) + expected_first = Series([1] * 3, index=[1, 2, 3]) + expected_last = Series([1] * 3, index=[5, 4, 3]) + + result = ser.nsmallest(3) + tm.assert_series_equal(result, expected_first) + + result = ser.nsmallest(3, keep="last") + tm.assert_series_equal(result, expected_last) + + result = ser.nlargest(3) + tm.assert_series_equal(result, expected_first) + + result = ser.nlargest(3, keep="last") + tm.assert_series_equal(result, expected_last) + + @pytest.mark.parametrize("n", range(1, 5)) + def test_nlargest_n(self, n): + + # GH 13412 + ser = Series([1, 4, 3, 2], 
index=[0, 0, 1, 1]) + result = ser.nlargest(n) + expected = ser.sort_values(ascending=False).head(n) + tm.assert_series_equal(result, expected) + + result = ser.nsmallest(n) + expected = ser.sort_values().head(n) + tm.assert_series_equal(result, expected) + + def test_nlargest_boundary_integer(self, nselect_method, any_int_dtype): + # GH#21426 + dtype_info = np.iinfo(any_int_dtype) + min_val, max_val = dtype_info.min, dtype_info.max + vals = [min_val, min_val + 1, max_val - 1, max_val] + assert_check_nselect_boundary(vals, any_int_dtype, nselect_method) + + def test_nlargest_boundary_float(self, nselect_method, float_dtype): + # GH#21426 + dtype_info = np.finfo(float_dtype) + min_val, max_val = dtype_info.min, dtype_info.max + min_2nd, max_2nd = np.nextafter([min_val, max_val], 0, dtype=float_dtype) + vals = [min_val, min_2nd, max_2nd, max_val] + assert_check_nselect_boundary(vals, float_dtype, nselect_method) + + @pytest.mark.parametrize("dtype", ["datetime64[ns]", "timedelta64[ns]"]) + def test_nlargest_boundary_datetimelike(self, nselect_method, dtype): + # GH#21426 + # use int64 bounds and +1 to min_val since true minimum is NaT + # (include min_val/NaT at end to maintain same expected_idxr) + dtype_info = np.iinfo("int64") + min_val, max_val = dtype_info.min, dtype_info.max + vals = [min_val + 1, min_val + 2, max_val - 1, max_val, min_val] + assert_check_nselect_boundary(vals, dtype, nselect_method) + + def test_nlargest_duplicate_keep_all_ties(self): + # see GH#16818 + ser = Series([10, 9, 8, 7, 7, 7, 7, 6]) + result = ser.nlargest(4, keep="all") + expected = Series([10, 9, 8, 7, 7, 7, 7]) + tm.assert_series_equal(result, expected) + + result = ser.nsmallest(2, keep="all") + expected = Series([6, 7, 7, 7, 7], index=[7, 3, 4, 5, 6]) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "data,expected", [([True, False], [True]), ([True, False, True, True], [True])] + ) + def test_nlargest_boolean(self, data, expected): + # GH#26154 : ensure True > False + ser = Series(data) + result = ser.nlargest(1) + expected = Series(expected) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_searchsorted.py b/pandas/tests/series/methods/test_searchsorted.py new file mode 100644 index 0000000000000..0d6e9635579f0 --- /dev/null +++ b/pandas/tests/series/methods/test_searchsorted.py @@ -0,0 +1,55 @@ +import numpy as np + +from pandas import Series, Timestamp, date_range +from pandas.api.types import is_scalar +import pandas.util.testing as tm + + +class TestSeriesSearchSorted: + def test_searchsorted(self): + ser = Series([1, 2, 3]) + + result = ser.searchsorted(1, side="left") + assert is_scalar(result) + assert result == 0 + + result = ser.searchsorted(1, side="right") + assert is_scalar(result) + assert result == 1 + + def test_searchsorted_numeric_dtypes_scalar(self): + ser = Series([1, 2, 90, 1000, 3e9]) + res = ser.searchsorted(30) + assert is_scalar(res) + assert res == 2 + + res = ser.searchsorted([30]) + exp = np.array([2], dtype=np.intp) + tm.assert_numpy_array_equal(res, exp) + + def test_searchsorted_numeric_dtypes_vector(self): + ser = Series([1, 2, 90, 1000, 3e9]) + res = ser.searchsorted([91, 2e6]) + exp = np.array([3, 4], dtype=np.intp) + tm.assert_numpy_array_equal(res, exp) + + def test_searchsorted_datetime64_scalar(self): + ser = Series(date_range("20120101", periods=10, freq="2D")) + val = Timestamp("20120102") + res = ser.searchsorted(val) + assert is_scalar(res) + assert res == 1 + + def 
test_searchsorted_datetime64_list(self):
+        ser = Series(date_range("20120101", periods=10, freq="2D"))
+        vals = [Timestamp("20120102"), Timestamp("20120104")]
+        res = ser.searchsorted(vals)
+        exp = np.array([1, 2], dtype=np.intp)
+        tm.assert_numpy_array_equal(res, exp)
+
+    def test_searchsorted_sorter(self):
+        # GH8490
+        ser = Series([3, 1, 2])
+        res = ser.searchsorted([0, 3], sorter=np.argsort(ser))
+        exp = np.array([0, 2], dtype=np.intp)
+        tm.assert_numpy_array_equal(res, exp)
diff --git a/pandas/tests/series/methods/test_value_counts.py b/pandas/tests/series/methods/test_value_counts.py
new file mode 100644
index 0000000000000..15d895f44c7b2
--- /dev/null
+++ b/pandas/tests/series/methods/test_value_counts.py
@@ -0,0 +1,179 @@
+import numpy as np
+
+import pandas as pd
+from pandas import Categorical, CategoricalIndex, Series
+import pandas.util.testing as tm
+
+
+class TestSeriesValueCounts:
+    def test_value_counts_datetime(self):
+        # most dtypes are tested in tests/base
+        values = [
+            pd.Timestamp("2011-01-01 09:00"),
+            pd.Timestamp("2011-01-01 10:00"),
+            pd.Timestamp("2011-01-01 11:00"),
+            pd.Timestamp("2011-01-01 09:00"),
+            pd.Timestamp("2011-01-01 09:00"),
+            pd.Timestamp("2011-01-01 11:00"),
+        ]
+
+        exp_idx = pd.DatetimeIndex(
+            ["2011-01-01 09:00", "2011-01-01 11:00", "2011-01-01 10:00"]
+        )
+        exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx")
+
+        ser = pd.Series(values, name="xxx")
+        tm.assert_series_equal(ser.value_counts(), exp)
+        # check DatetimeIndex outputs the same result
+        idx = pd.DatetimeIndex(values, name="xxx")
+        tm.assert_series_equal(idx.value_counts(), exp)
+
+        # normalize
+        exp = pd.Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx")
+        tm.assert_series_equal(ser.value_counts(normalize=True), exp)
+        tm.assert_series_equal(idx.value_counts(normalize=True), exp)
+
+    def test_value_counts_datetime_tz(self):
+        values = [
+            pd.Timestamp("2011-01-01 09:00", tz="US/Eastern"),
+            pd.Timestamp("2011-01-01 10:00", tz="US/Eastern"),
+            pd.Timestamp("2011-01-01 11:00", tz="US/Eastern"),
+            pd.Timestamp("2011-01-01 09:00", tz="US/Eastern"),
+            pd.Timestamp("2011-01-01 09:00", tz="US/Eastern"),
+            pd.Timestamp("2011-01-01 11:00", tz="US/Eastern"),
+        ]
+
+        exp_idx = pd.DatetimeIndex(
+            ["2011-01-01 09:00", "2011-01-01 11:00", "2011-01-01 10:00"],
+            tz="US/Eastern",
+        )
+        exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx")
+
+        ser = pd.Series(values, name="xxx")
+        tm.assert_series_equal(ser.value_counts(), exp)
+        idx = pd.DatetimeIndex(values, name="xxx")
+        tm.assert_series_equal(idx.value_counts(), exp)
+
+        exp = pd.Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx")
+        tm.assert_series_equal(ser.value_counts(normalize=True), exp)
+        tm.assert_series_equal(idx.value_counts(normalize=True), exp)
+
+    def test_value_counts_period(self):
+        values = [
+            pd.Period("2011-01", freq="M"),
+            pd.Period("2011-02", freq="M"),
+            pd.Period("2011-03", freq="M"),
+            pd.Period("2011-01", freq="M"),
+            pd.Period("2011-01", freq="M"),
+            pd.Period("2011-03", freq="M"),
+        ]
+
+        exp_idx = pd.PeriodIndex(["2011-01", "2011-03", "2011-02"], freq="M")
+        exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx")
+
+        ser = pd.Series(values, name="xxx")
+        tm.assert_series_equal(ser.value_counts(), exp)
+        # check PeriodIndex outputs the same result
+        idx = pd.PeriodIndex(values, name="xxx")
+        tm.assert_series_equal(idx.value_counts(), exp)
+
+        # normalize
+        exp = pd.Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx")
+        tm.assert_series_equal(ser.value_counts(normalize=True), exp)
+ tm.assert_series_equal(idx.value_counts(normalize=True), exp) + + def test_value_counts_categorical_ordered(self): + # most dtypes are tested in tests/base + values = pd.Categorical([1, 2, 3, 1, 1, 3], ordered=True) + + exp_idx = pd.CategoricalIndex([1, 3, 2], categories=[1, 2, 3], ordered=True) + exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx") + + ser = pd.Series(values, name="xxx") + tm.assert_series_equal(ser.value_counts(), exp) + # check CategoricalIndex outputs the same result + idx = pd.CategoricalIndex(values, name="xxx") + tm.assert_series_equal(idx.value_counts(), exp) + + # normalize + exp = pd.Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") + tm.assert_series_equal(ser.value_counts(normalize=True), exp) + tm.assert_series_equal(idx.value_counts(normalize=True), exp) + + def test_value_counts_categorical_not_ordered(self): + values = pd.Categorical([1, 2, 3, 1, 1, 3], ordered=False) + + exp_idx = pd.CategoricalIndex([1, 3, 2], categories=[1, 2, 3], ordered=False) + exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx") + + ser = pd.Series(values, name="xxx") + tm.assert_series_equal(ser.value_counts(), exp) + # check CategoricalIndex outputs the same result + idx = pd.CategoricalIndex(values, name="xxx") + tm.assert_series_equal(idx.value_counts(), exp) + + # normalize + exp = pd.Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") + tm.assert_series_equal(ser.value_counts(normalize=True), exp) + tm.assert_series_equal(idx.value_counts(normalize=True), exp) + + def test_value_counts_categorical(self): + # GH#12835 + cats = Categorical(list("abcccb"), categories=list("cabd")) + ser = Series(cats, name="xxx") + res = ser.value_counts(sort=False) + + exp_index = CategoricalIndex(list("cabd"), categories=cats.categories) + exp = Series([3, 1, 2, 0], name="xxx", index=exp_index) + tm.assert_series_equal(res, exp) + + res = ser.value_counts(sort=True) + + exp_index = CategoricalIndex(list("cbad"), categories=cats.categories) + exp = Series([3, 2, 1, 0], name="xxx", index=exp_index) + tm.assert_series_equal(res, exp) + + # check object dtype handles the Series.name as the same + # (tested in tests/base) + ser = Series(["a", "b", "c", "c", "c", "b"], name="xxx") + res = ser.value_counts() + exp = Series([3, 2, 1], name="xxx", index=["c", "b", "a"]) + tm.assert_series_equal(res, exp) + + def test_value_counts_categorical_with_nan(self): + # see GH#9443 + + # sanity check + ser = Series(["a", "b", "a"], dtype="category") + exp = Series([2, 1], index=CategoricalIndex(["a", "b"])) + + res = ser.value_counts(dropna=True) + tm.assert_series_equal(res, exp) + + res = ser.value_counts(dropna=True) + tm.assert_series_equal(res, exp) + + # same Series via two different constructions --> same behaviour + series = [ + Series(["a", "b", None, "a", None, None], dtype="category"), + Series( + Categorical(["a", "b", None, "a", None, None], categories=["a", "b"]) + ), + ] + + for ser in series: + # None is a NaN value, so we exclude its count here + exp = Series([2, 1], index=CategoricalIndex(["a", "b"])) + res = ser.value_counts(dropna=True) + tm.assert_series_equal(res, exp) + + # we don't exclude the count of None and sort by counts + exp = Series([3, 2, 1], index=CategoricalIndex([np.nan, "a", "b"])) + res = ser.value_counts(dropna=False) + tm.assert_series_equal(res, exp) + + # When we aren't sorting by counts, and np.nan isn't a + # category, it should be last. 
+ exp = Series([2, 1, 3], index=CategoricalIndex(["a", "b", np.nan])) + res = ser.value_counts(dropna=False, sort=False) + tm.assert_series_equal(res, exp) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 148c376eba752..6b85714d06594 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1,4 +1,3 @@ -from itertools import product import operator import numpy as np @@ -7,17 +6,7 @@ import pandas.util._test_decorators as td import pandas as pd -from pandas import ( - Categorical, - CategoricalIndex, - DataFrame, - MultiIndex, - Series, - date_range, - isna, - notna, -) -from pandas.api.types import is_scalar +from pandas import Categorical, DataFrame, MultiIndex, Series, date_range, isna, notna from pandas.core.indexes.datetimes import Timestamp from pandas.core.indexes.timedeltas import TimedeltaIndex import pandas.util.testing as tm @@ -637,36 +626,6 @@ def test_clip_with_datetimes(self): ) tm.assert_series_equal(result, expected) - def test_cummethods_bool(self): - # GH 6270 - - a = pd.Series([False, False, False, True, True, False, False]) - b = ~a - c = pd.Series([False] * len(b)) - d = ~c - methods = { - "cumsum": np.cumsum, - "cumprod": np.cumprod, - "cummin": np.minimum.accumulate, - "cummax": np.maximum.accumulate, - } - args = product((a, b, c, d), methods) - for s, method in args: - expected = Series(methods[method](s.values)) - result = getattr(s, method)() - tm.assert_series_equal(result, expected) - - e = pd.Series([False, True, np.nan, False]) - cse = pd.Series([0, 1, np.nan, 1], dtype=object) - cpe = pd.Series([False, 0, np.nan, 0]) - cmin = pd.Series([False, False, np.nan, False]) - cmax = pd.Series([False, True, np.nan, True]) - expecteds = {"cumsum": cse, "cumprod": cpe, "cummin": cmin, "cummax": cmax} - - for method in methods: - res = getattr(e, method)() - tm.assert_series_equal(res, expecteds[method]) - def test_isin(self): s = Series(["A", "B", "C", "a", "B", "B", "A", "C"]) @@ -803,54 +762,6 @@ def test_numpy_repeat(self): with pytest.raises(ValueError, match=msg): np.repeat(s, 2, axis=0) - def test_searchsorted(self): - s = Series([1, 2, 3]) - - result = s.searchsorted(1, side="left") - assert is_scalar(result) - assert result == 0 - - result = s.searchsorted(1, side="right") - assert is_scalar(result) - assert result == 1 - - def test_searchsorted_numeric_dtypes_scalar(self): - s = Series([1, 2, 90, 1000, 3e9]) - r = s.searchsorted(30) - assert is_scalar(r) - assert r == 2 - - r = s.searchsorted([30]) - e = np.array([2], dtype=np.intp) - tm.assert_numpy_array_equal(r, e) - - def test_searchsorted_numeric_dtypes_vector(self): - s = Series([1, 2, 90, 1000, 3e9]) - r = s.searchsorted([91, 2e6]) - e = np.array([3, 4], dtype=np.intp) - tm.assert_numpy_array_equal(r, e) - - def test_search_sorted_datetime64_scalar(self): - s = Series(pd.date_range("20120101", periods=10, freq="2D")) - v = pd.Timestamp("20120102") - r = s.searchsorted(v) - assert is_scalar(r) - assert r == 1 - - def test_search_sorted_datetime64_list(self): - s = Series(pd.date_range("20120101", periods=10, freq="2D")) - v = [pd.Timestamp("20120102"), pd.Timestamp("20120104")] - r = s.searchsorted(v) - e = np.array([1, 2], dtype=np.intp) - tm.assert_numpy_array_equal(r, e) - - def test_searchsorted_sorter(self): - # GH8490 - s = Series([3, 1, 2]) - r = s.searchsorted([0, 3], sorter=np.argsort(s)) - e = np.array([0, 2], dtype=np.intp) - tm.assert_numpy_array_equal(r, e) - def test_is_monotonic(self): s = 
Series(np.random.randint(0, 10, size=1000)) @@ -1004,117 +915,6 @@ def test_unstack(self): right.index = pd.MultiIndex.from_tuples(tpls) tm.assert_frame_equal(ts.unstack(level=0), right) - def test_value_counts_datetime(self): - # most dtypes are tested in tests/base - values = [ - pd.Timestamp("2011-01-01 09:00"), - pd.Timestamp("2011-01-01 10:00"), - pd.Timestamp("2011-01-01 11:00"), - pd.Timestamp("2011-01-01 09:00"), - pd.Timestamp("2011-01-01 09:00"), - pd.Timestamp("2011-01-01 11:00"), - ] - - exp_idx = pd.DatetimeIndex( - ["2011-01-01 09:00", "2011-01-01 11:00", "2011-01-01 10:00"] - ) - exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx") - - s = pd.Series(values, name="xxx") - tm.assert_series_equal(s.value_counts(), exp) - # check DatetimeIndex outputs the same result - idx = pd.DatetimeIndex(values, name="xxx") - tm.assert_series_equal(idx.value_counts(), exp) - - # normalize - exp = pd.Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") - tm.assert_series_equal(s.value_counts(normalize=True), exp) - tm.assert_series_equal(idx.value_counts(normalize=True), exp) - - def test_value_counts_datetime_tz(self): - values = [ - pd.Timestamp("2011-01-01 09:00", tz="US/Eastern"), - pd.Timestamp("2011-01-01 10:00", tz="US/Eastern"), - pd.Timestamp("2011-01-01 11:00", tz="US/Eastern"), - pd.Timestamp("2011-01-01 09:00", tz="US/Eastern"), - pd.Timestamp("2011-01-01 09:00", tz="US/Eastern"), - pd.Timestamp("2011-01-01 11:00", tz="US/Eastern"), - ] - - exp_idx = pd.DatetimeIndex( - ["2011-01-01 09:00", "2011-01-01 11:00", "2011-01-01 10:00"], - tz="US/Eastern", - ) - exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx") - - s = pd.Series(values, name="xxx") - tm.assert_series_equal(s.value_counts(), exp) - idx = pd.DatetimeIndex(values, name="xxx") - tm.assert_series_equal(idx.value_counts(), exp) - - exp = pd.Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") - tm.assert_series_equal(s.value_counts(normalize=True), exp) - tm.assert_series_equal(idx.value_counts(normalize=True), exp) - - def test_value_counts_period(self): - values = [ - pd.Period("2011-01", freq="M"), - pd.Period("2011-02", freq="M"), - pd.Period("2011-03", freq="M"), - pd.Period("2011-01", freq="M"), - pd.Period("2011-01", freq="M"), - pd.Period("2011-03", freq="M"), - ] - - exp_idx = pd.PeriodIndex(["2011-01", "2011-03", "2011-02"], freq="M") - exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx") - - s = pd.Series(values, name="xxx") - tm.assert_series_equal(s.value_counts(), exp) - # check DatetimeIndex outputs the same result - idx = pd.PeriodIndex(values, name="xxx") - tm.assert_series_equal(idx.value_counts(), exp) - - # normalize - exp = pd.Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") - tm.assert_series_equal(s.value_counts(normalize=True), exp) - tm.assert_series_equal(idx.value_counts(normalize=True), exp) - - def test_value_counts_categorical_ordered(self): - # most dtypes are tested in tests/base - values = pd.Categorical([1, 2, 3, 1, 1, 3], ordered=True) - - exp_idx = pd.CategoricalIndex([1, 3, 2], categories=[1, 2, 3], ordered=True) - exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx") - - s = pd.Series(values, name="xxx") - tm.assert_series_equal(s.value_counts(), exp) - # check CategoricalIndex outputs the same result - idx = pd.CategoricalIndex(values, name="xxx") - tm.assert_series_equal(idx.value_counts(), exp) - - # normalize - exp = pd.Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") - tm.assert_series_equal(s.value_counts(normalize=True), 
exp) - tm.assert_series_equal(idx.value_counts(normalize=True), exp) - - def test_value_counts_categorical_not_ordered(self): - values = pd.Categorical([1, 2, 3, 1, 1, 3], ordered=False) - - exp_idx = pd.CategoricalIndex([1, 3, 2], categories=[1, 2, 3], ordered=False) - exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx") - - s = pd.Series(values, name="xxx") - tm.assert_series_equal(s.value_counts(), exp) - # check CategoricalIndex outputs the same result - idx = pd.CategoricalIndex(values, name="xxx") - tm.assert_series_equal(idx.value_counts(), exp) - - # normalize - exp = pd.Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") - tm.assert_series_equal(s.value_counts(normalize=True), exp) - tm.assert_series_equal(idx.value_counts(normalize=True), exp) - @pytest.mark.parametrize("func", [np.any, np.all]) @pytest.mark.parametrize("kwargs", [dict(keepdims=True), dict(out=object())]) @td.skip_if_np_lt("1.15") @@ -1166,207 +966,6 @@ def test_validate_stat_keepdims(self): np.sum(s, keepdims=True) -main_dtypes = [ - "datetime", - "datetimetz", - "timedelta", - "int8", - "int16", - "int32", - "int64", - "float32", - "float64", - "uint8", - "uint16", - "uint32", - "uint64", -] - - -@pytest.fixture -def s_main_dtypes(): - """A DataFrame with many dtypes - - * datetime - * datetimetz - * timedelta - * [u]int{8,16,32,64} - * float{32,64} - - The columns are the name of the dtype. - """ - df = pd.DataFrame( - { - "datetime": pd.to_datetime(["2003", "2002", "2001", "2002", "2005"]), - "datetimetz": pd.to_datetime( - ["2003", "2002", "2001", "2002", "2005"] - ).tz_localize("US/Eastern"), - "timedelta": pd.to_timedelta(["3d", "2d", "1d", "2d", "5d"]), - } - ) - - for dtype in [ - "int8", - "int16", - "int32", - "int64", - "float32", - "float64", - "uint8", - "uint16", - "uint32", - "uint64", - ]: - df[dtype] = Series([3, 2, 1, 2, 5], dtype=dtype) - - return df - - -@pytest.fixture(params=main_dtypes) -def s_main_dtypes_split(request, s_main_dtypes): - """Each series in s_main_dtypes.""" - return s_main_dtypes[request.param] - - -def assert_check_nselect_boundary(vals, dtype, method): - # helper function for 'test_boundary_{dtype}' tests - s = Series(vals, dtype=dtype) - result = getattr(s, method)(3) - expected_idxr = [0, 1, 2] if method == "nsmallest" else [3, 2, 1] - expected = s.loc[expected_idxr] - tm.assert_series_equal(result, expected) - - -class TestNLargestNSmallest: - @pytest.mark.parametrize( - "r", - [ - Series([3.0, 2, 1, 2, "5"], dtype="object"), - Series([3.0, 2, 1, 2, 5], dtype="object"), - # not supported on some archs - # Series([3., 2, 1, 2, 5], dtype='complex256'), - Series([3.0, 2, 1, 2, 5], dtype="complex128"), - Series(list("abcde")), - Series(list("abcde"), dtype="category"), - ], - ) - def test_error(self, r): - dt = r.dtype - msg = "Cannot use method 'n(larg|small)est' with dtype {dt}".format(dt=dt) - args = 2, len(r), 0, -1 - methods = r.nlargest, r.nsmallest - for method, arg in product(methods, args): - with pytest.raises(TypeError, match=msg): - method(arg) - - def test_nsmallest_nlargest(self, s_main_dtypes_split): - # float, int, datetime64 (use i8), timedelts64 (same), - # object that are numbers, object that are strings - s = s_main_dtypes_split - - tm.assert_series_equal(s.nsmallest(2), s.iloc[[2, 1]]) - tm.assert_series_equal(s.nsmallest(2, keep="last"), s.iloc[[2, 3]]) - - empty = s.iloc[0:0] - tm.assert_series_equal(s.nsmallest(0), empty) - tm.assert_series_equal(s.nsmallest(-1), empty) - tm.assert_series_equal(s.nlargest(0), empty) - 
tm.assert_series_equal(s.nlargest(-1), empty) - - tm.assert_series_equal(s.nsmallest(len(s)), s.sort_values()) - tm.assert_series_equal(s.nsmallest(len(s) + 1), s.sort_values()) - tm.assert_series_equal(s.nlargest(len(s)), s.iloc[[4, 0, 1, 3, 2]]) - tm.assert_series_equal(s.nlargest(len(s) + 1), s.iloc[[4, 0, 1, 3, 2]]) - - def test_misc(self): - - s = Series([3.0, np.nan, 1, 2, 5]) - tm.assert_series_equal(s.nlargest(), s.iloc[[4, 0, 3, 2]]) - tm.assert_series_equal(s.nsmallest(), s.iloc[[2, 3, 0, 4]]) - - msg = 'keep must be either "first", "last"' - with pytest.raises(ValueError, match=msg): - s.nsmallest(keep="invalid") - with pytest.raises(ValueError, match=msg): - s.nlargest(keep="invalid") - - # GH 15297 - s = Series([1] * 5, index=[1, 2, 3, 4, 5]) - expected_first = Series([1] * 3, index=[1, 2, 3]) - expected_last = Series([1] * 3, index=[5, 4, 3]) - - result = s.nsmallest(3) - tm.assert_series_equal(result, expected_first) - - result = s.nsmallest(3, keep="last") - tm.assert_series_equal(result, expected_last) - - result = s.nlargest(3) - tm.assert_series_equal(result, expected_first) - - result = s.nlargest(3, keep="last") - tm.assert_series_equal(result, expected_last) - - @pytest.mark.parametrize("n", range(1, 5)) - def test_n(self, n): - - # GH 13412 - s = Series([1, 4, 3, 2], index=[0, 0, 1, 1]) - result = s.nlargest(n) - expected = s.sort_values(ascending=False).head(n) - tm.assert_series_equal(result, expected) - - result = s.nsmallest(n) - expected = s.sort_values().head(n) - tm.assert_series_equal(result, expected) - - def test_boundary_integer(self, nselect_method, any_int_dtype): - # GH 21426 - dtype_info = np.iinfo(any_int_dtype) - min_val, max_val = dtype_info.min, dtype_info.max - vals = [min_val, min_val + 1, max_val - 1, max_val] - assert_check_nselect_boundary(vals, any_int_dtype, nselect_method) - - def test_boundary_float(self, nselect_method, float_dtype): - # GH 21426 - dtype_info = np.finfo(float_dtype) - min_val, max_val = dtype_info.min, dtype_info.max - min_2nd, max_2nd = np.nextafter([min_val, max_val], 0, dtype=float_dtype) - vals = [min_val, min_2nd, max_2nd, max_val] - assert_check_nselect_boundary(vals, float_dtype, nselect_method) - - @pytest.mark.parametrize("dtype", ["datetime64[ns]", "timedelta64[ns]"]) - def test_boundary_datetimelike(self, nselect_method, dtype): - # GH 21426 - # use int64 bounds and +1 to min_val since true minimum is NaT - # (include min_val/NaT at end to maintain same expected_idxr) - dtype_info = np.iinfo("int64") - min_val, max_val = dtype_info.min, dtype_info.max - vals = [min_val + 1, min_val + 2, max_val - 1, max_val, min_val] - assert_check_nselect_boundary(vals, dtype, nselect_method) - - def test_duplicate_keep_all_ties(self): - # see gh-16818 - s = Series([10, 9, 8, 7, 7, 7, 7, 6]) - result = s.nlargest(4, keep="all") - expected = Series([10, 9, 8, 7, 7, 7, 7]) - tm.assert_series_equal(result, expected) - - result = s.nsmallest(2, keep="all") - expected = Series([6, 7, 7, 7, 7], index=[7, 3, 4, 5, 6]) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize( - "data,expected", [([True, False], [True]), ([True, False, True, True], [True])] - ) - def test_boolean(self, data, expected): - # GH 26154 : ensure True > False - s = Series(data) - result = s.nlargest(1) - expected = Series(expected) - tm.assert_series_equal(result, expected) - - class TestCategoricalSeriesAnalytics: def test_count(self): @@ -1378,67 +977,6 @@ def test_count(self): result = s.count() assert result == 2 - def 
test_value_counts(self): - # GH 12835 - cats = Categorical(list("abcccb"), categories=list("cabd")) - s = Series(cats, name="xxx") - res = s.value_counts(sort=False) - - exp_index = CategoricalIndex(list("cabd"), categories=cats.categories) - exp = Series([3, 1, 2, 0], name="xxx", index=exp_index) - tm.assert_series_equal(res, exp) - - res = s.value_counts(sort=True) - - exp_index = CategoricalIndex(list("cbad"), categories=cats.categories) - exp = Series([3, 2, 1, 0], name="xxx", index=exp_index) - tm.assert_series_equal(res, exp) - - # check object dtype handles the Series.name as the same - # (tested in tests/base) - s = Series(["a", "b", "c", "c", "c", "b"], name="xxx") - res = s.value_counts() - exp = Series([3, 2, 1], name="xxx", index=["c", "b", "a"]) - tm.assert_series_equal(res, exp) - - def test_value_counts_with_nan(self): - # see gh-9443 - - # sanity check - s = Series(["a", "b", "a"], dtype="category") - exp = Series([2, 1], index=CategoricalIndex(["a", "b"])) - - res = s.value_counts(dropna=True) - tm.assert_series_equal(res, exp) - - res = s.value_counts(dropna=True) - tm.assert_series_equal(res, exp) - - # same Series via two different constructions --> same behaviour - series = [ - Series(["a", "b", None, "a", None, None], dtype="category"), - Series( - Categorical(["a", "b", None, "a", None, None], categories=["a", "b"]) - ), - ] - - for s in series: - # None is a NaN value, so we exclude its count here - exp = Series([2, 1], index=CategoricalIndex(["a", "b"])) - res = s.value_counts(dropna=True) - tm.assert_series_equal(res, exp) - - # we don't exclude the count of None and sort by counts - exp = Series([3, 2, 1], index=CategoricalIndex([np.nan, "a", "b"])) - res = s.value_counts(dropna=False) - tm.assert_series_equal(res, exp) - - # When we aren't sorting by counts, and np.nan isn't a - # category, it should be last. - exp = Series([2, 1, 3], index=CategoricalIndex(["a", "b", np.nan])) - res = s.value_counts(dropna=False, sort=False) - tm.assert_series_equal(res, exp) - @pytest.mark.parametrize( "dtype", ["int_", "uint", "float_", "unicode_", "timedelta64[h]", "datetime64[D]"], diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py index a31cc9d968f3a..0fac279291c66 100644 --- a/pandas/tests/series/test_cumulative.py +++ b/pandas/tests/series/test_cumulative.py @@ -5,6 +5,8 @@ -------- tests.frame.test_cumulative """ +from itertools import product + import numpy as np import pytest @@ -140,3 +142,33 @@ def test_cummax_timedelta64(self): ) result = s.cummax(skipna=False) tm.assert_series_equal(expected, result) + + def test_cummethods_bool(self): + # GH#6270 + + a = pd.Series([False, False, False, True, True, False, False]) + b = ~a + c = pd.Series([False] * len(b)) + d = ~c + methods = { + "cumsum": np.cumsum, + "cumprod": np.cumprod, + "cummin": np.minimum.accumulate, + "cummax": np.maximum.accumulate, + } + args = product((a, b, c, d), methods) + for s, method in args: + expected = pd.Series(methods[method](s.values)) + result = getattr(s, method)() + tm.assert_series_equal(result, expected) + + e = pd.Series([False, True, np.nan, False]) + cse = pd.Series([0, 1, np.nan, 1], dtype=object) + cpe = pd.Series([False, 0, np.nan, 0]) + cmin = pd.Series([False, False, np.nan, False]) + cmax = pd.Series([False, True, np.nan, True]) + expecteds = {"cumsum": cse, "cumprod": cpe, "cummin": cmin, "cummax": cmax} + + for method in methods: + res = getattr(e, method)() + tm.assert_series_equal(res, expecteds[method])
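
For quick reference while reviewing, the keep="all" tie-handling asserted by the moved
nlargest/nsmallest tests can be reproduced with a minimal standalone sketch. This snippet
is not part of the patch; it assumes a pandas version where keep="all" is available:

    import pandas as pd

    ser = pd.Series([10, 9, 8, 7, 7, 7, 7, 6])

    # keep="first" (the default) truncates ties so exactly n rows are returned;
    # keep="all" retains every value tied with the n-th largest, so the result
    # may be longer than n -- here 7 rows instead of 4.
    print(ser.nlargest(4))              # values [10, 9, 8, 7]
    print(ser.nlargest(4, keep="all"))  # values [10, 9, 8, 7, 7, 7, 7]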