|
7 | 7 | import sys
|
8 | 8 | import pytest
|
9 | 9 |
|
| 10 | +from string import ascii_lowercase |
10 | 11 | from numpy import nan
|
11 | 12 | from numpy.random import randn
|
12 | 13 | import numpy as np
|
13 | 14 |
|
14 |
| -from pandas.compat import lrange |
| 15 | +from pandas.compat import lrange, product |
15 | 16 | from pandas import (compat, isnull, notnull, DataFrame, Series,
|
16 | 17 | MultiIndex, date_range, Timestamp)
|
17 | 18 | import pandas as pd
|
@@ -1120,73 +1121,6 @@ def __nonzero__(self):
|
1120 | 1121 | self.assertTrue(r1.all())
|
1121 | 1122 |
|
1122 | 1123 | # ----------------------------------------------------------------------
|
1123 |
| - # Top / bottom |
1124 |
| - |
1125 |
def test_nlargest(self):
    """``nlargest`` on one column equals a descending sort + ``head``.

    Regression test for GH10393.
    """
    from string import ascii_lowercase

    values = np.random.permutation(10)
    labels = list(ascii_lowercase[:10])
    frame = pd.DataFrame({'a': values, 'b': labels})

    top_rows = frame.nlargest(5, 'a')
    sorted_rows = frame.sort_values('a', ascending=False).head(5)
    tm.assert_frame_equal(top_rows, sorted_rows)
1133 |
| - |
1134 |
def test_nlargest_multiple_columns(self):
    """``nlargest`` over ['a', 'b'] equals a descending sort + ``head``."""
    from string import ascii_lowercase

    # Keep the two permutation draws in this order ('a' first, then 'c')
    # so the random stream is consumed exactly as before.
    ints = np.random.permutation(10)
    letters = list(ascii_lowercase[:10])
    floats = np.random.permutation(10).astype('float64')
    frame = pd.DataFrame({'a': ints, 'b': letters, 'c': floats})

    keys = ['a', 'b']
    top_rows = frame.nlargest(5, keys)
    sorted_rows = frame.sort_values(keys, ascending=False).head(5)
    tm.assert_frame_equal(top_rows, sorted_rows)
1142 |
| - |
1143 |
def test_nsmallest(self):
    """``nsmallest`` on one column equals an ascending sort + ``head``."""
    from string import ascii_lowercase

    values = np.random.permutation(10)
    labels = list(ascii_lowercase[:10])
    frame = pd.DataFrame({'a': values, 'b': labels})

    bottom_rows = frame.nsmallest(5, 'a')
    sorted_rows = frame.sort_values('a').head(5)
    tm.assert_frame_equal(bottom_rows, sorted_rows)
1150 |
| - |
1151 |
def test_nsmallest_multiple_columns(self):
    """``nsmallest`` over ['a', 'c'] equals an ascending sort + ``head``."""
    from string import ascii_lowercase

    # Keep the two permutation draws in this order ('a' first, then 'c')
    # so the random stream is consumed exactly as before.
    ints = np.random.permutation(10)
    letters = list(ascii_lowercase[:10])
    floats = np.random.permutation(10).astype('float64')
    frame = pd.DataFrame({'a': ints, 'b': letters, 'c': floats})

    keys = ['a', 'c']
    bottom_rows = frame.nsmallest(5, keys)
    sorted_rows = frame.sort_values(keys).head(5)
    tm.assert_frame_equal(bottom_rows, sorted_rows)
1159 |
| - |
1160 |
def test_nsmallest_nlargest_duplicate_index(self):
    """Both methods agree with sort + ``head`` on a repeated index.

    Regression test for GH 13412.
    """
    frame = pd.DataFrame({'a': [1, 2, 3, 4],
                          'b': [4, 3, 2, 1],
                          'c': [0, 1, 2, 3]},
                         index=[0, 0, 1, 1])

    # (method name, sort keys), checked in the original order.
    cases = [
        ('nsmallest', 'a'),
        ('nlargest', 'a'),
        ('nsmallest', ['a', 'c']),
        ('nsmallest', ['c', 'a']),
        ('nlargest', ['a', 'c']),
        ('nlargest', ['c', 'a']),
    ]
    for method_name, keys in cases:
        # nsmallest corresponds to an ascending sort, nlargest to descending.
        ascending = method_name == 'nsmallest'
        result = getattr(frame, method_name)(4, keys)
        expected = frame.sort_values(keys, ascending=ascending).head(4)
        tm.assert_frame_equal(result, expected)
1189 |
| - # ---------------------------------------------------------------------- |
1190 | 1124 | # Isin
|
1191 | 1125 |
|
1192 | 1126 | def test_isin(self):
|
@@ -1965,3 +1899,132 @@ def test_dot(self):
|
1965 | 1899 |
|
1966 | 1900 | with tm.assertRaisesRegexp(ValueError, 'aligned'):
|
1967 | 1901 | df.dot(df2)
|
| 1902 | + |
| 1903 | + |
@pytest.fixture
def df_duplicates():
    """Five-row frame whose index labels repeat (0, 0, 1, 1, 1).

    Column 'a' ends in a tie (4, 4) and column 'b' is constant.
    """
    columns = {'a': [1, 2, 3, 4, 4],
               'b': [1, 1, 1, 1, 1],
               'c': [0, 1, 2, 5, 4]}
    repeated_index = [0, 0, 1, 1, 1]
    return pd.DataFrame(columns, index=repeated_index)
| 1910 | + |
| 1911 | + |
@pytest.fixture
def df_strings():
    """Ten-row frame: shuffled ints 'a', letters 'b', shuffled floats 'c'."""
    # Draw 'a' before 'c' so the random stream is consumed in the same
    # order as the dict literal this replaces.
    shuffled_ints = np.random.permutation(10)
    letters = list(ascii_lowercase[:10])
    shuffled_floats = np.random.permutation(10).astype('float64')
    return pd.DataFrame({'a': shuffled_ints,
                         'b': letters,
                         'c': shuffled_floats})
| 1917 | + |
| 1918 | + |
@pytest.fixture
def df_main_dtypes():
    """Three-row frame with one column per dtype the tests exercise."""
    column_order = ['group', 'int', 'float', 'string',
                    'category_string', 'category_int',
                    'datetime', 'datetimetz',
                    'timedelta']

    letters = list('abc')
    naive_dates = pd.date_range('20130101', periods=3)
    eastern_dates = pd.date_range('20130101', periods=3, tz='US/Eastern')
    deltas = pd.timedelta_range('1 s', periods=3, freq='s')

    data = {
        'group': [1, 1, 2],
        'int': [1, 2, 3],
        'float': [4., 5., 6.],
        'string': letters,
        'category_string': pd.Series(list('abc')).astype('category'),
        # NOTE(review): despite its name this column is built from plain
        # ints, not a categorical -- confirm that is intended.
        'category_int': [7, 8, 9],
        'datetime': naive_dates,
        'datetimetz': eastern_dates,
        'timedelta': deltas,
    }
    return pd.DataFrame(data, columns=column_order)
| 1937 | + |
| 1938 | + |
class TestNLargestNSmallest(object):
    """Tests for ``DataFrame.nsmallest`` and ``DataFrame.nlargest``."""

    # Message expected when a column's dtype is rejected; it is formatted
    # per test and then used as the regex for ``assertRaisesRegexp``.
    dtype_error_msg_template = ("Column {column!r} has dtype {dtype}, cannot "
                                "use method {method!r} with this dtype")

    # ----------------------------------------------------------------------
    # Top / bottom
    @pytest.mark.parametrize(
        'method, n, order',
        product(['nsmallest', 'nlargest'], range(1, 11),
                [['a'],
                 ['c'],
                 ['a', 'b'],
                 ['a', 'c'],
                 ['b', 'a'],
                 ['b', 'c'],
                 ['a', 'b', 'c'],
                 ['c', 'a', 'b'],
                 ['c', 'b', 'a'],
                 ['b', 'c', 'a'],
                 ['b', 'a', 'c'],

                 # dups!
                 ['b', 'c', 'c'],

                 ]))
    def test_n(self, df_strings, method, n, order):
        """Check each method for every ``n`` and column ordering (GH10393).

        Orderings that include the string column 'b' must raise
        ``TypeError`` with the dtype message; every other ordering must
        match ``sort_values(order).head(n)`` in the matching direction.
        """
        df = df_strings
        if 'b' in order:

            error_msg = self.dtype_error_msg_template.format(
                column='b', method=method, dtype='object')
            with tm.assertRaisesRegexp(TypeError, error_msg):
                getattr(df, method)(n, order)
        else:
            # nsmallest corresponds to an ascending sort, nlargest to
            # a descending one.
            ascending = method == 'nsmallest'
            result = getattr(df, method)(n, order)
            expected = df.sort_values(order, ascending=ascending).head(n)
            tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        'method, columns',
        product(['nsmallest', 'nlargest'],
                product(['group'], ['category_string', 'string'])
                ))
    def test_n_error(self, df_main_dtypes, method, columns):
        """Selecting on ('group', <column>) raises for the second column.

        The expected message names ``columns[1]`` and that column's dtype
        as reported by the frame.
        """
        df = df_main_dtypes
        error_msg = self.dtype_error_msg_template.format(
            column=columns[1], method=method, dtype=df[columns[1]].dtype)
        with tm.assertRaisesRegexp(TypeError, error_msg):
            getattr(df, method)(2, columns)

    def test_n_all_dtypes(self, df_main_dtypes):
        """Smoke test: both methods run on every column except the two
        excluded string-like ones ('category_string' and 'string').
        """
        df = df_main_dtypes
        # Select the columns in frame order instead of via a set
        # difference: set iteration order for strings depends on the hash
        # seed (randomized by default on Python 3), which would change the
        # column priority from one test run to the next.
        excluded = ('category_string', 'string')
        columns = [col for col in df.columns if col not in excluded]
        df.nsmallest(2, columns)
        df.nlargest(2, columns)

    def test_n_identical_values(self):
        """With all values of 'a' equal, both methods keep the first rows.

        Regression test for GH15297.
        """
        df = pd.DataFrame({'a': [1] * 5, 'b': [1, 2, 3, 4, 5]})

        result = df.nlargest(3, 'a')
        expected = pd.DataFrame(
            {'a': [1] * 3, 'b': [1, 2, 3]}, index=[0, 1, 2]
        )
        tm.assert_frame_equal(result, expected)

        result = df.nsmallest(3, 'a')
        expected = pd.DataFrame({'a': [1] * 3, 'b': [1, 2, 3]})
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        'n, order',
        product([1, 2, 3, 4, 5],
                [['a', 'b', 'c'],
                 ['c', 'b', 'a'],
                 ['a'],
                 ['b'],
                 ['a', 'b'],
                 ['c', 'b']]))
    def test_n_duplicate_index(self, df_duplicates, n, order):
        """Both methods agree with sort + ``head`` on a repeated index.

        Regression test for GH 13412.
        """
        df = df_duplicates
        result = df.nsmallest(n, order)
        expected = df.sort_values(order).head(n)
        tm.assert_frame_equal(result, expected)

        result = df.nlargest(n, order)
        expected = df.sort_values(order, ascending=False).head(n)
        tm.assert_frame_equal(result, expected)
0 commit comments