From 27a2bb7c73697b7274bdb864dc09b29dfc817574 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Fri, 13 Jul 2018 21:30:27 +0200 Subject: [PATCH 1/4] TST: add test for duplicated frame/test_analytics --- pandas/tests/frame/test_analytics.py | 103 +++++++++++++++++++-------- 1 file changed, 72 insertions(+), 31 deletions(-) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index c0e9b89c1877f..37edf8764ce00 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -12,7 +12,7 @@ from numpy.random import randn import numpy as np -from pandas.compat import lrange, PY35 +from pandas.compat import lrange, PY35, string_types from pandas import (compat, isna, notna, DataFrame, Series, MultiIndex, date_range, Timestamp, Categorical, _np_version_under1p12, @@ -1545,6 +1545,77 @@ def test_isin_empty_datetimelike(self): # ---------------------------------------------------------------------- # Row deduplication + @pytest.mark.parametrize('subset', ['a', ['a'], ['a', 'B']]) + def test_duplicated_with_misspelled_column_name(self, subset): + # GH 19730 + df = pd.DataFrame({'A': [0, 0, 1], + 'B': [0, 0, 1], + 'C': [0, 0, 1]}) + + with pytest.raises(KeyError): + df.duplicated(subset) + + with pytest.raises(KeyError): + df.drop_duplicates(subset) + + @pytest.mark.slow + def test_duplicated_do_not_fail_on_wide_dataframes(self): + # gh-21524 + # Given the wide dataframe with a lot of columns + # with different (important!) values + data = {'col_{0:02d}'.format(i): np.random.randint(0, 1000, 30000) + for i in range(100)} + df = pd.DataFrame(data).T + result = df.duplicated() + + # Then duplicates produce the bool pd.Series as a result + # and don't fail during calculation. + # Actual values doesn't matter here, though usually + # it's all False in this case + assert isinstance(result, pd.Series) + assert result.dtype == np.bool + + @pytest.mark.parametrize('keep, expected', [ + ('first', Series([False, False, True, False, True])), + ('last', Series([True, True, False, False, False])), + (False, Series([True, True, True, False, True])) + ]) + def test_duplicated_keep(self, keep, expected): + df = DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']}) + + result = df.duplicated(keep=keep) + tm.assert_series_equal(result, expected) + + @pytest.mark.xfail(reason="GH21720; nan/None falsely considered equal") + @pytest.mark.parametrize('keep, expected', [ + ('first', Series([False, False, True, False, True])), + ('last', Series([True, True, False, False, False])), + (False, Series([True, True, True, False, True])) + ]) + def test_duplicated_nan_none(self, keep, expected): + df = DataFrame({'C': [np.nan, 3, 3, None, np.nan]}, dtype=object) + + result = df.duplicated(keep=keep) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize('keep', ['first', 'last', False]) + @pytest.mark.parametrize('subset', [None, ['A', 'B'], 'A']) + def test_duplicated_subset(self, subset, keep): + df = DataFrame({'A': [0, 1, 1, 2, 0], + 'B': ['a', 'b', 'b', 'c', 'a'], + 'C': [np.nan, 3, 3, None, np.nan]}) + + if subset is None: + subset = list(df.columns) + elif isinstance(subset, string_types): + # need to have a DataFrame, not a Series + # -> select columns with singleton list, not string + subset = [subset] + + expected = df[subset].duplicated(keep=keep) + result = df.duplicated(keep=keep, subset=subset) + tm.assert_series_equal(result, expected) + def test_drop_duplicates(self): df = DataFrame({'AAA': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'bar', 'foo'], @@ -1640,36 +1711,6 @@ def test_drop_duplicates(self): for keep in ['first', 'last', False]: assert df.duplicated(keep=keep).sum() == 0 - @pytest.mark.parametrize('subset', ['a', ['a'], ['a', 'B']]) - def test_duplicated_with_misspelled_column_name(self, subset): - # GH 19730 - df = pd.DataFrame({'A': [0, 0, 1], - 'B': [0, 0, 1], - 'C': [0, 0, 1]}) - - with pytest.raises(KeyError): - df.duplicated(subset) - - with pytest.raises(KeyError): - df.drop_duplicates(subset) - - @pytest.mark.slow - def test_duplicated_do_not_fail_on_wide_dataframes(self): - # gh-21524 - # Given the wide dataframe with a lot of columns - # with different (important!) values - data = {'col_{0:02d}'.format(i): np.random.randint(0, 1000, 30000) - for i in range(100)} - df = pd.DataFrame(data).T - result = df.duplicated() - - # Then duplicates produce the bool pd.Series as a result - # and don't fail during calculation. - # Actual values doesn't matter here, though usually - # it's all False in this case - assert isinstance(result, pd.Series) - assert result.dtype == np.bool - def test_drop_duplicates_with_duplicate_column_names(self): # GH17836 df = DataFrame([ From add4ac1b7709515688dcf5e2b8de334ab526b7ac Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sat, 14 Jul 2018 18:09:45 +0200 Subject: [PATCH 2/4] Split off tests for duplicates into separate file --- pandas/tests/frame/test_analytics.py | 421 +------------------------ pandas/tests/frame/test_duplicates.py | 433 ++++++++++++++++++++++++++ 2 files changed, 434 insertions(+), 420 deletions(-) create mode 100644 pandas/tests/frame/test_duplicates.py diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 37edf8764ce00..a399fa2b68680 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -12,7 +12,7 @@ from numpy.random import randn import numpy as np -from pandas.compat import lrange, PY35, string_types +from pandas.compat import lrange, PY35 from pandas import (compat, isna, notna, DataFrame, Series, MultiIndex, date_range, Timestamp, Categorical, _np_version_under1p12, @@ -1542,425 +1542,6 @@ def test_isin_empty_datetimelike(self): result = df1_td.isin(df3) tm.assert_frame_equal(result, expected) - # ---------------------------------------------------------------------- - # Row deduplication - - @pytest.mark.parametrize('subset', ['a', ['a'], ['a', 'B']]) - def test_duplicated_with_misspelled_column_name(self, subset): - # GH 19730 - df = pd.DataFrame({'A': [0, 0, 1], - 'B': [0, 0, 1], - 'C': [0, 0, 1]}) - - with pytest.raises(KeyError): - df.duplicated(subset) - - with pytest.raises(KeyError): - df.drop_duplicates(subset) - - @pytest.mark.slow - def test_duplicated_do_not_fail_on_wide_dataframes(self): - # gh-21524 - # Given the wide dataframe with a lot of columns - # with different (important!) values - data = {'col_{0:02d}'.format(i): np.random.randint(0, 1000, 30000) - for i in range(100)} - df = pd.DataFrame(data).T - result = df.duplicated() - - # Then duplicates produce the bool pd.Series as a result - # and don't fail during calculation. - # Actual values doesn't matter here, though usually - # it's all False in this case - assert isinstance(result, pd.Series) - assert result.dtype == np.bool - - @pytest.mark.parametrize('keep, expected', [ - ('first', Series([False, False, True, False, True])), - ('last', Series([True, True, False, False, False])), - (False, Series([True, True, True, False, True])) - ]) - def test_duplicated_keep(self, keep, expected): - df = DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']}) - - result = df.duplicated(keep=keep) - tm.assert_series_equal(result, expected) - - @pytest.mark.xfail(reason="GH21720; nan/None falsely considered equal") - @pytest.mark.parametrize('keep, expected', [ - ('first', Series([False, False, True, False, True])), - ('last', Series([True, True, False, False, False])), - (False, Series([True, True, True, False, True])) - ]) - def test_duplicated_nan_none(self, keep, expected): - df = DataFrame({'C': [np.nan, 3, 3, None, np.nan]}, dtype=object) - - result = df.duplicated(keep=keep) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize('keep', ['first', 'last', False]) - @pytest.mark.parametrize('subset', [None, ['A', 'B'], 'A']) - def test_duplicated_subset(self, subset, keep): - df = DataFrame({'A': [0, 1, 1, 2, 0], - 'B': ['a', 'b', 'b', 'c', 'a'], - 'C': [np.nan, 3, 3, None, np.nan]}) - - if subset is None: - subset = list(df.columns) - elif isinstance(subset, string_types): - # need to have a DataFrame, not a Series - # -> select columns with singleton list, not string - subset = [subset] - - expected = df[subset].duplicated(keep=keep) - result = df.duplicated(keep=keep, subset=subset) - tm.assert_series_equal(result, expected) - - def test_drop_duplicates(self): - df = DataFrame({'AAA': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'bar', 'foo'], - 'B': ['one', 'one', 'two', 'two', - 'two', 'two', 'one', 'two'], - 'C': [1, 1, 2, 2, 2, 2, 1, 2], - 'D': lrange(8)}) - - # single column - result = df.drop_duplicates('AAA') - expected = df[:2] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates('AAA', keep='last') - expected = df.loc[[6, 7]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates('AAA', keep=False) - expected = df.loc[[]] - tm.assert_frame_equal(result, expected) - assert len(result) == 0 - - # multi column - expected = df.loc[[0, 1, 2, 3]] - result = df.drop_duplicates(np.array(['AAA', 'B'])) - tm.assert_frame_equal(result, expected) - result = df.drop_duplicates(['AAA', 'B']) - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates(('AAA', 'B'), keep='last') - expected = df.loc[[0, 5, 6, 7]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates(('AAA', 'B'), keep=False) - expected = df.loc[[0]] - tm.assert_frame_equal(result, expected) - - # consider everything - df2 = df.loc[:, ['AAA', 'B', 'C']] - - result = df2.drop_duplicates() - # in this case only - expected = df2.drop_duplicates(['AAA', 'B']) - tm.assert_frame_equal(result, expected) - - result = df2.drop_duplicates(keep='last') - expected = df2.drop_duplicates(['AAA', 'B'], keep='last') - tm.assert_frame_equal(result, expected) - - result = df2.drop_duplicates(keep=False) - expected = df2.drop_duplicates(['AAA', 'B'], keep=False) - tm.assert_frame_equal(result, expected) - - # integers - result = df.drop_duplicates('C') - expected = df.iloc[[0, 2]] - tm.assert_frame_equal(result, expected) - result = df.drop_duplicates('C', keep='last') - expected = df.iloc[[-2, -1]] - tm.assert_frame_equal(result, expected) - - df['E'] = df['C'].astype('int8') - result = df.drop_duplicates('E') - expected = df.iloc[[0, 2]] - tm.assert_frame_equal(result, expected) - result = df.drop_duplicates('E', keep='last') - expected = df.iloc[[-2, -1]] - tm.assert_frame_equal(result, expected) - - # GH 11376 - df = pd.DataFrame({'x': [7, 6, 3, 3, 4, 8, 0], - 'y': [0, 6, 5, 5, 9, 1, 2]}) - expected = df.loc[df.index != 3] - tm.assert_frame_equal(df.drop_duplicates(), expected) - - df = pd.DataFrame([[1, 0], [0, 2]]) - tm.assert_frame_equal(df.drop_duplicates(), df) - - df = pd.DataFrame([[-2, 0], [0, -4]]) - tm.assert_frame_equal(df.drop_duplicates(), df) - - x = np.iinfo(np.int64).max / 3 * 2 - df = pd.DataFrame([[-x, x], [0, x + 4]]) - tm.assert_frame_equal(df.drop_duplicates(), df) - - df = pd.DataFrame([[-x, x], [x, x + 4]]) - tm.assert_frame_equal(df.drop_duplicates(), df) - - # GH 11864 - df = pd.DataFrame([i] * 9 for i in range(16)) - df = df.append([[1] + [0] * 8], ignore_index=True) - - for keep in ['first', 'last', False]: - assert df.duplicated(keep=keep).sum() == 0 - - def test_drop_duplicates_with_duplicate_column_names(self): - # GH17836 - df = DataFrame([ - [1, 2, 5], - [3, 4, 6], - [3, 4, 7] - ], columns=['a', 'a', 'b']) - - result0 = df.drop_duplicates() - tm.assert_frame_equal(result0, df) - - result1 = df.drop_duplicates('a') - expected1 = df[:2] - tm.assert_frame_equal(result1, expected1) - - def test_drop_duplicates_for_take_all(self): - df = DataFrame({'AAA': ['foo', 'bar', 'baz', 'bar', - 'foo', 'bar', 'qux', 'foo'], - 'B': ['one', 'one', 'two', 'two', - 'two', 'two', 'one', 'two'], - 'C': [1, 1, 2, 2, 2, 2, 1, 2], - 'D': lrange(8)}) - - # single column - result = df.drop_duplicates('AAA') - expected = df.iloc[[0, 1, 2, 6]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates('AAA', keep='last') - expected = df.iloc[[2, 5, 6, 7]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates('AAA', keep=False) - expected = df.iloc[[2, 6]] - tm.assert_frame_equal(result, expected) - - # multiple columns - result = df.drop_duplicates(['AAA', 'B']) - expected = df.iloc[[0, 1, 2, 3, 4, 6]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates(['AAA', 'B'], keep='last') - expected = df.iloc[[0, 1, 2, 5, 6, 7]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates(['AAA', 'B'], keep=False) - expected = df.iloc[[0, 1, 2, 6]] - tm.assert_frame_equal(result, expected) - - def test_drop_duplicates_tuple(self): - df = DataFrame({('AA', 'AB'): ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'bar', 'foo'], - 'B': ['one', 'one', 'two', 'two', - 'two', 'two', 'one', 'two'], - 'C': [1, 1, 2, 2, 2, 2, 1, 2], - 'D': lrange(8)}) - - # single column - result = df.drop_duplicates(('AA', 'AB')) - expected = df[:2] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates(('AA', 'AB'), keep='last') - expected = df.loc[[6, 7]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates(('AA', 'AB'), keep=False) - expected = df.loc[[]] # empty df - assert len(result) == 0 - tm.assert_frame_equal(result, expected) - - # multi column - expected = df.loc[[0, 1, 2, 3]] - result = df.drop_duplicates((('AA', 'AB'), 'B')) - tm.assert_frame_equal(result, expected) - - def test_drop_duplicates_NA(self): - # none - df = DataFrame({'A': [None, None, 'foo', 'bar', - 'foo', 'bar', 'bar', 'foo'], - 'B': ['one', 'one', 'two', 'two', - 'two', 'two', 'one', 'two'], - 'C': [1.0, np.nan, np.nan, np.nan, 1., 1., 1, 1.], - 'D': lrange(8)}) - - # single column - result = df.drop_duplicates('A') - expected = df.loc[[0, 2, 3]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates('A', keep='last') - expected = df.loc[[1, 6, 7]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates('A', keep=False) - expected = df.loc[[]] # empty df - tm.assert_frame_equal(result, expected) - assert len(result) == 0 - - # multi column - result = df.drop_duplicates(['A', 'B']) - expected = df.loc[[0, 2, 3, 6]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates(['A', 'B'], keep='last') - expected = df.loc[[1, 5, 6, 7]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates(['A', 'B'], keep=False) - expected = df.loc[[6]] - tm.assert_frame_equal(result, expected) - - # nan - df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'bar', 'foo'], - 'B': ['one', 'one', 'two', 'two', - 'two', 'two', 'one', 'two'], - 'C': [1.0, np.nan, np.nan, np.nan, 1., 1., 1, 1.], - 'D': lrange(8)}) - - # single column - result = df.drop_duplicates('C') - expected = df[:2] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates('C', keep='last') - expected = df.loc[[3, 7]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates('C', keep=False) - expected = df.loc[[]] # empty df - tm.assert_frame_equal(result, expected) - assert len(result) == 0 - - # multi column - result = df.drop_duplicates(['C', 'B']) - expected = df.loc[[0, 1, 2, 4]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates(['C', 'B'], keep='last') - expected = df.loc[[1, 3, 6, 7]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates(['C', 'B'], keep=False) - expected = df.loc[[1]] - tm.assert_frame_equal(result, expected) - - def test_drop_duplicates_NA_for_take_all(self): - # none - df = DataFrame({'A': [None, None, 'foo', 'bar', - 'foo', 'baz', 'bar', 'qux'], - 'C': [1.0, np.nan, np.nan, np.nan, 1., 2., 3, 1.]}) - - # single column - result = df.drop_duplicates('A') - expected = df.iloc[[0, 2, 3, 5, 7]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates('A', keep='last') - expected = df.iloc[[1, 4, 5, 6, 7]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates('A', keep=False) - expected = df.iloc[[5, 7]] - tm.assert_frame_equal(result, expected) - - # nan - - # single column - result = df.drop_duplicates('C') - expected = df.iloc[[0, 1, 5, 6]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates('C', keep='last') - expected = df.iloc[[3, 5, 6, 7]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates('C', keep=False) - expected = df.iloc[[5, 6]] - tm.assert_frame_equal(result, expected) - - def test_drop_duplicates_inplace(self): - orig = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'bar', 'foo'], - 'B': ['one', 'one', 'two', 'two', - 'two', 'two', 'one', 'two'], - 'C': [1, 1, 2, 2, 2, 2, 1, 2], - 'D': lrange(8)}) - - # single column - df = orig.copy() - df.drop_duplicates('A', inplace=True) - expected = orig[:2] - result = df - tm.assert_frame_equal(result, expected) - - df = orig.copy() - df.drop_duplicates('A', keep='last', inplace=True) - expected = orig.loc[[6, 7]] - result = df - tm.assert_frame_equal(result, expected) - - df = orig.copy() - df.drop_duplicates('A', keep=False, inplace=True) - expected = orig.loc[[]] - result = df - tm.assert_frame_equal(result, expected) - assert len(df) == 0 - - # multi column - df = orig.copy() - df.drop_duplicates(['A', 'B'], inplace=True) - expected = orig.loc[[0, 1, 2, 3]] - result = df - tm.assert_frame_equal(result, expected) - - df = orig.copy() - df.drop_duplicates(['A', 'B'], keep='last', inplace=True) - expected = orig.loc[[0, 5, 6, 7]] - result = df - tm.assert_frame_equal(result, expected) - - df = orig.copy() - df.drop_duplicates(['A', 'B'], keep=False, inplace=True) - expected = orig.loc[[0]] - result = df - tm.assert_frame_equal(result, expected) - - # consider everything - orig2 = orig.loc[:, ['A', 'B', 'C']].copy() - - df2 = orig2.copy() - df2.drop_duplicates(inplace=True) - # in this case only - expected = orig2.drop_duplicates(['A', 'B']) - result = df2 - tm.assert_frame_equal(result, expected) - - df2 = orig2.copy() - df2.drop_duplicates(keep='last', inplace=True) - expected = orig2.drop_duplicates(['A', 'B'], keep='last') - result = df2 - tm.assert_frame_equal(result, expected) - - df2 = orig2.copy() - df2.drop_duplicates(keep=False, inplace=True) - expected = orig2.drop_duplicates(['A', 'B'], keep=False) - result = df2 - tm.assert_frame_equal(result, expected) - # Rounding def test_round(self): # GH 2665 diff --git a/pandas/tests/frame/test_duplicates.py b/pandas/tests/frame/test_duplicates.py new file mode 100644 index 0000000000000..e8cab0cd378af --- /dev/null +++ b/pandas/tests/frame/test_duplicates.py @@ -0,0 +1,433 @@ +# -*- coding: utf-8 -*- + +from __future__ import print_function + +import pytest + +import pandas as pd +import numpy as np + +from pandas.compat import lrange, string_types +from pandas import DataFrame, Series + +import pandas.util.testing as tm +from pandas.tests.frame.common import TestData + + +class TestDataFrameDuplicates(TestData): + + @pytest.mark.parametrize('subset', ['a', ['a'], ['a', 'B']]) + def test_duplicated_with_misspelled_column_name(self, subset): + # GH 19730 + df = pd.DataFrame({'A': [0, 0, 1], + 'B': [0, 0, 1], + 'C': [0, 0, 1]}) + + with pytest.raises(KeyError): + df.duplicated(subset) + + with pytest.raises(KeyError): + df.drop_duplicates(subset) + + @pytest.mark.slow + def test_duplicated_do_not_fail_on_wide_dataframes(self): + # gh-21524 + # Given the wide dataframe with a lot of columns + # with different (important!) values + data = {'col_{0:02d}'.format(i): np.random.randint(0, 1000, 30000) + for i in range(100)} + df = pd.DataFrame(data).T + result = df.duplicated() + + # Then duplicates produce the bool pd.Series as a result + # and don't fail during calculation. + # Actual values doesn't matter here, though usually + # it's all False in this case + assert isinstance(result, pd.Series) + assert result.dtype == np.bool + + @pytest.mark.parametrize('keep, expected', [ + ('first', Series([False, False, True, False, True])), + ('last', Series([True, True, False, False, False])), + (False, Series([True, True, True, False, True])) + ]) + def test_duplicated_keep(self, keep, expected): + df = DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']}) + + result = df.duplicated(keep=keep) + tm.assert_series_equal(result, expected) + + @pytest.mark.xfail(reason="GH21720; nan/None falsely considered equal") + @pytest.mark.parametrize('keep, expected', [ + ('first', Series([False, False, True, False, True])), + ('last', Series([True, True, False, False, False])), + (False, Series([True, True, True, False, True])) + ]) + def test_duplicated_nan_none(self, keep, expected): + df = DataFrame({'C': [np.nan, 3, 3, None, np.nan]}, dtype=object) + + result = df.duplicated(keep=keep) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize('keep', ['first', 'last', False]) + @pytest.mark.parametrize('subset', [None, ['A', 'B'], 'A']) + def test_duplicated_subset(self, subset, keep): + df = DataFrame({'A': [0, 1, 1, 2, 0], + 'B': ['a', 'b', 'b', 'c', 'a'], + 'C': [np.nan, 3, 3, None, np.nan]}) + + if subset is None: + subset = list(df.columns) + elif isinstance(subset, string_types): + # need to have a DataFrame, not a Series + # -> select columns with singleton list, not string + subset = [subset] + + expected = df[subset].duplicated(keep=keep) + result = df.duplicated(keep=keep, subset=subset) + tm.assert_series_equal(result, expected) + + def test_drop_duplicates(self): + df = DataFrame({'AAA': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'bar', 'foo'], + 'B': ['one', 'one', 'two', 'two', + 'two', 'two', 'one', 'two'], + 'C': [1, 1, 2, 2, 2, 2, 1, 2], + 'D': lrange(8)}) + + # single column + result = df.drop_duplicates('AAA') + expected = df[:2] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates('AAA', keep='last') + expected = df.loc[[6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates('AAA', keep=False) + expected = df.loc[[]] + tm.assert_frame_equal(result, expected) + assert len(result) == 0 + + # multi column + expected = df.loc[[0, 1, 2, 3]] + result = df.drop_duplicates(np.array(['AAA', 'B'])) + tm.assert_frame_equal(result, expected) + result = df.drop_duplicates(['AAA', 'B']) + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(('AAA', 'B'), keep='last') + expected = df.loc[[0, 5, 6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(('AAA', 'B'), keep=False) + expected = df.loc[[0]] + tm.assert_frame_equal(result, expected) + + # consider everything + df2 = df.loc[:, ['AAA', 'B', 'C']] + + result = df2.drop_duplicates() + # in this case only + expected = df2.drop_duplicates(['AAA', 'B']) + tm.assert_frame_equal(result, expected) + + result = df2.drop_duplicates(keep='last') + expected = df2.drop_duplicates(['AAA', 'B'], keep='last') + tm.assert_frame_equal(result, expected) + + result = df2.drop_duplicates(keep=False) + expected = df2.drop_duplicates(['AAA', 'B'], keep=False) + tm.assert_frame_equal(result, expected) + + # integers + result = df.drop_duplicates('C') + expected = df.iloc[[0, 2]] + tm.assert_frame_equal(result, expected) + result = df.drop_duplicates('C', keep='last') + expected = df.iloc[[-2, -1]] + tm.assert_frame_equal(result, expected) + + df['E'] = df['C'].astype('int8') + result = df.drop_duplicates('E') + expected = df.iloc[[0, 2]] + tm.assert_frame_equal(result, expected) + result = df.drop_duplicates('E', keep='last') + expected = df.iloc[[-2, -1]] + tm.assert_frame_equal(result, expected) + + # GH 11376 + df = pd.DataFrame({'x': [7, 6, 3, 3, 4, 8, 0], + 'y': [0, 6, 5, 5, 9, 1, 2]}) + expected = df.loc[df.index != 3] + tm.assert_frame_equal(df.drop_duplicates(), expected) + + df = pd.DataFrame([[1, 0], [0, 2]]) + tm.assert_frame_equal(df.drop_duplicates(), df) + + df = pd.DataFrame([[-2, 0], [0, -4]]) + tm.assert_frame_equal(df.drop_duplicates(), df) + + x = np.iinfo(np.int64).max / 3 * 2 + df = pd.DataFrame([[-x, x], [0, x + 4]]) + tm.assert_frame_equal(df.drop_duplicates(), df) + + df = pd.DataFrame([[-x, x], [x, x + 4]]) + tm.assert_frame_equal(df.drop_duplicates(), df) + + # GH 11864 + df = pd.DataFrame([i] * 9 for i in range(16)) + df = df.append([[1] + [0] * 8], ignore_index=True) + + for keep in ['first', 'last', False]: + assert df.duplicated(keep=keep).sum() == 0 + + def test_drop_duplicates_with_duplicate_column_names(self): + # GH17836 + df = DataFrame([ + [1, 2, 5], + [3, 4, 6], + [3, 4, 7] + ], columns=['a', 'a', 'b']) + + result0 = df.drop_duplicates() + tm.assert_frame_equal(result0, df) + + result1 = df.drop_duplicates('a') + expected1 = df[:2] + tm.assert_frame_equal(result1, expected1) + + def test_drop_duplicates_for_take_all(self): + df = DataFrame({'AAA': ['foo', 'bar', 'baz', 'bar', + 'foo', 'bar', 'qux', 'foo'], + 'B': ['one', 'one', 'two', 'two', + 'two', 'two', 'one', 'two'], + 'C': [1, 1, 2, 2, 2, 2, 1, 2], + 'D': lrange(8)}) + + # single column + result = df.drop_duplicates('AAA') + expected = df.iloc[[0, 1, 2, 6]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates('AAA', keep='last') + expected = df.iloc[[2, 5, 6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates('AAA', keep=False) + expected = df.iloc[[2, 6]] + tm.assert_frame_equal(result, expected) + + # multiple columns + result = df.drop_duplicates(['AAA', 'B']) + expected = df.iloc[[0, 1, 2, 3, 4, 6]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(['AAA', 'B'], keep='last') + expected = df.iloc[[0, 1, 2, 5, 6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(['AAA', 'B'], keep=False) + expected = df.iloc[[0, 1, 2, 6]] + tm.assert_frame_equal(result, expected) + + def test_drop_duplicates_tuple(self): + df = DataFrame({('AA', 'AB'): ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'bar', 'foo'], + 'B': ['one', 'one', 'two', 'two', + 'two', 'two', 'one', 'two'], + 'C': [1, 1, 2, 2, 2, 2, 1, 2], + 'D': lrange(8)}) + + # single column + result = df.drop_duplicates(('AA', 'AB')) + expected = df[:2] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(('AA', 'AB'), keep='last') + expected = df.loc[[6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(('AA', 'AB'), keep=False) + expected = df.loc[[]] # empty df + assert len(result) == 0 + tm.assert_frame_equal(result, expected) + + # multi column + expected = df.loc[[0, 1, 2, 3]] + result = df.drop_duplicates((('AA', 'AB'), 'B')) + tm.assert_frame_equal(result, expected) + + def test_drop_duplicates_NA(self): + # none + df = DataFrame({'A': [None, None, 'foo', 'bar', + 'foo', 'bar', 'bar', 'foo'], + 'B': ['one', 'one', 'two', 'two', + 'two', 'two', 'one', 'two'], + 'C': [1.0, np.nan, np.nan, np.nan, 1., 1., 1, 1.], + 'D': lrange(8)}) + + # single column + result = df.drop_duplicates('A') + expected = df.loc[[0, 2, 3]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates('A', keep='last') + expected = df.loc[[1, 6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates('A', keep=False) + expected = df.loc[[]] # empty df + tm.assert_frame_equal(result, expected) + assert len(result) == 0 + + # multi column + result = df.drop_duplicates(['A', 'B']) + expected = df.loc[[0, 2, 3, 6]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(['A', 'B'], keep='last') + expected = df.loc[[1, 5, 6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(['A', 'B'], keep=False) + expected = df.loc[[6]] + tm.assert_frame_equal(result, expected) + + # nan + df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'bar', 'foo'], + 'B': ['one', 'one', 'two', 'two', + 'two', 'two', 'one', 'two'], + 'C': [1.0, np.nan, np.nan, np.nan, 1., 1., 1, 1.], + 'D': lrange(8)}) + + # single column + result = df.drop_duplicates('C') + expected = df[:2] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates('C', keep='last') + expected = df.loc[[3, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates('C', keep=False) + expected = df.loc[[]] # empty df + tm.assert_frame_equal(result, expected) + assert len(result) == 0 + + # multi column + result = df.drop_duplicates(['C', 'B']) + expected = df.loc[[0, 1, 2, 4]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(['C', 'B'], keep='last') + expected = df.loc[[1, 3, 6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(['C', 'B'], keep=False) + expected = df.loc[[1]] + tm.assert_frame_equal(result, expected) + + def test_drop_duplicates_NA_for_take_all(self): + # none + df = DataFrame({'A': [None, None, 'foo', 'bar', + 'foo', 'baz', 'bar', 'qux'], + 'C': [1.0, np.nan, np.nan, np.nan, 1., 2., 3, 1.]}) + + # single column + result = df.drop_duplicates('A') + expected = df.iloc[[0, 2, 3, 5, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates('A', keep='last') + expected = df.iloc[[1, 4, 5, 6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates('A', keep=False) + expected = df.iloc[[5, 7]] + tm.assert_frame_equal(result, expected) + + # nan + + # single column + result = df.drop_duplicates('C') + expected = df.iloc[[0, 1, 5, 6]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates('C', keep='last') + expected = df.iloc[[3, 5, 6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates('C', keep=False) + expected = df.iloc[[5, 6]] + tm.assert_frame_equal(result, expected) + + def test_drop_duplicates_inplace(self): + orig = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'bar', 'foo'], + 'B': ['one', 'one', 'two', 'two', + 'two', 'two', 'one', 'two'], + 'C': [1, 1, 2, 2, 2, 2, 1, 2], + 'D': lrange(8)}) + + # single column + df = orig.copy() + df.drop_duplicates('A', inplace=True) + expected = orig[:2] + result = df + tm.assert_frame_equal(result, expected) + + df = orig.copy() + df.drop_duplicates('A', keep='last', inplace=True) + expected = orig.loc[[6, 7]] + result = df + tm.assert_frame_equal(result, expected) + + df = orig.copy() + df.drop_duplicates('A', keep=False, inplace=True) + expected = orig.loc[[]] + result = df + tm.assert_frame_equal(result, expected) + assert len(df) == 0 + + # multi column + df = orig.copy() + df.drop_duplicates(['A', 'B'], inplace=True) + expected = orig.loc[[0, 1, 2, 3]] + result = df + tm.assert_frame_equal(result, expected) + + df = orig.copy() + df.drop_duplicates(['A', 'B'], keep='last', inplace=True) + expected = orig.loc[[0, 5, 6, 7]] + result = df + tm.assert_frame_equal(result, expected) + + df = orig.copy() + df.drop_duplicates(['A', 'B'], keep=False, inplace=True) + expected = orig.loc[[0]] + result = df + tm.assert_frame_equal(result, expected) + + # consider everything + orig2 = orig.loc[:, ['A', 'B', 'C']].copy() + + df2 = orig2.copy() + df2.drop_duplicates(inplace=True) + # in this case only + expected = orig2.drop_duplicates(['A', 'B']) + result = df2 + tm.assert_frame_equal(result, expected) + + df2 = orig2.copy() + df2.drop_duplicates(keep='last', inplace=True) + expected = orig2.drop_duplicates(['A', 'B'], keep='last') + result = df2 + tm.assert_frame_equal(result, expected) + + df2 = orig2.copy() + df2.drop_duplicates(keep=False, inplace=True) + expected = orig2.drop_duplicates(['A', 'B'], keep=False) + result = df2 + tm.assert_frame_equal(result, expected) From e8a8d423339b8514d935880be03cb09e77b44cf4 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sat, 14 Jul 2018 22:08:50 +0200 Subject: [PATCH 3/4] Change tests away from classes --- pandas/tests/frame/test_duplicates.py | 848 +++++++++++++------------- 1 file changed, 428 insertions(+), 420 deletions(-) diff --git a/pandas/tests/frame/test_duplicates.py b/pandas/tests/frame/test_duplicates.py index e8cab0cd378af..1421c03c0e343 100644 --- a/pandas/tests/frame/test_duplicates.py +++ b/pandas/tests/frame/test_duplicates.py @@ -11,423 +11,431 @@ from pandas import DataFrame, Series import pandas.util.testing as tm -from pandas.tests.frame.common import TestData - - -class TestDataFrameDuplicates(TestData): - - @pytest.mark.parametrize('subset', ['a', ['a'], ['a', 'B']]) - def test_duplicated_with_misspelled_column_name(self, subset): - # GH 19730 - df = pd.DataFrame({'A': [0, 0, 1], - 'B': [0, 0, 1], - 'C': [0, 0, 1]}) - - with pytest.raises(KeyError): - df.duplicated(subset) - - with pytest.raises(KeyError): - df.drop_duplicates(subset) - - @pytest.mark.slow - def test_duplicated_do_not_fail_on_wide_dataframes(self): - # gh-21524 - # Given the wide dataframe with a lot of columns - # with different (important!) values - data = {'col_{0:02d}'.format(i): np.random.randint(0, 1000, 30000) - for i in range(100)} - df = pd.DataFrame(data).T - result = df.duplicated() - - # Then duplicates produce the bool pd.Series as a result - # and don't fail during calculation. - # Actual values doesn't matter here, though usually - # it's all False in this case - assert isinstance(result, pd.Series) - assert result.dtype == np.bool - - @pytest.mark.parametrize('keep, expected', [ - ('first', Series([False, False, True, False, True])), - ('last', Series([True, True, False, False, False])), - (False, Series([True, True, True, False, True])) - ]) - def test_duplicated_keep(self, keep, expected): - df = DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']}) - - result = df.duplicated(keep=keep) - tm.assert_series_equal(result, expected) - - @pytest.mark.xfail(reason="GH21720; nan/None falsely considered equal") - @pytest.mark.parametrize('keep, expected', [ - ('first', Series([False, False, True, False, True])), - ('last', Series([True, True, False, False, False])), - (False, Series([True, True, True, False, True])) - ]) - def test_duplicated_nan_none(self, keep, expected): - df = DataFrame({'C': [np.nan, 3, 3, None, np.nan]}, dtype=object) - - result = df.duplicated(keep=keep) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize('keep', ['first', 'last', False]) - @pytest.mark.parametrize('subset', [None, ['A', 'B'], 'A']) - def test_duplicated_subset(self, subset, keep): - df = DataFrame({'A': [0, 1, 1, 2, 0], - 'B': ['a', 'b', 'b', 'c', 'a'], - 'C': [np.nan, 3, 3, None, np.nan]}) - - if subset is None: - subset = list(df.columns) - elif isinstance(subset, string_types): - # need to have a DataFrame, not a Series - # -> select columns with singleton list, not string - subset = [subset] - - expected = df[subset].duplicated(keep=keep) - result = df.duplicated(keep=keep, subset=subset) - tm.assert_series_equal(result, expected) - - def test_drop_duplicates(self): - df = DataFrame({'AAA': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'bar', 'foo'], - 'B': ['one', 'one', 'two', 'two', - 'two', 'two', 'one', 'two'], - 'C': [1, 1, 2, 2, 2, 2, 1, 2], - 'D': lrange(8)}) - - # single column - result = df.drop_duplicates('AAA') - expected = df[:2] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates('AAA', keep='last') - expected = df.loc[[6, 7]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates('AAA', keep=False) - expected = df.loc[[]] - tm.assert_frame_equal(result, expected) - assert len(result) == 0 - - # multi column - expected = df.loc[[0, 1, 2, 3]] - result = df.drop_duplicates(np.array(['AAA', 'B'])) - tm.assert_frame_equal(result, expected) - result = df.drop_duplicates(['AAA', 'B']) - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates(('AAA', 'B'), keep='last') - expected = df.loc[[0, 5, 6, 7]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates(('AAA', 'B'), keep=False) - expected = df.loc[[0]] - tm.assert_frame_equal(result, expected) - - # consider everything - df2 = df.loc[:, ['AAA', 'B', 'C']] - - result = df2.drop_duplicates() - # in this case only - expected = df2.drop_duplicates(['AAA', 'B']) - tm.assert_frame_equal(result, expected) - - result = df2.drop_duplicates(keep='last') - expected = df2.drop_duplicates(['AAA', 'B'], keep='last') - tm.assert_frame_equal(result, expected) - - result = df2.drop_duplicates(keep=False) - expected = df2.drop_duplicates(['AAA', 'B'], keep=False) - tm.assert_frame_equal(result, expected) - - # integers - result = df.drop_duplicates('C') - expected = df.iloc[[0, 2]] - tm.assert_frame_equal(result, expected) - result = df.drop_duplicates('C', keep='last') - expected = df.iloc[[-2, -1]] - tm.assert_frame_equal(result, expected) - - df['E'] = df['C'].astype('int8') - result = df.drop_duplicates('E') - expected = df.iloc[[0, 2]] - tm.assert_frame_equal(result, expected) - result = df.drop_duplicates('E', keep='last') - expected = df.iloc[[-2, -1]] - tm.assert_frame_equal(result, expected) - - # GH 11376 - df = pd.DataFrame({'x': [7, 6, 3, 3, 4, 8, 0], - 'y': [0, 6, 5, 5, 9, 1, 2]}) - expected = df.loc[df.index != 3] - tm.assert_frame_equal(df.drop_duplicates(), expected) - - df = pd.DataFrame([[1, 0], [0, 2]]) - tm.assert_frame_equal(df.drop_duplicates(), df) - - df = pd.DataFrame([[-2, 0], [0, -4]]) - tm.assert_frame_equal(df.drop_duplicates(), df) - - x = np.iinfo(np.int64).max / 3 * 2 - df = pd.DataFrame([[-x, x], [0, x + 4]]) - tm.assert_frame_equal(df.drop_duplicates(), df) - - df = pd.DataFrame([[-x, x], [x, x + 4]]) - tm.assert_frame_equal(df.drop_duplicates(), df) - - # GH 11864 - df = pd.DataFrame([i] * 9 for i in range(16)) - df = df.append([[1] + [0] * 8], ignore_index=True) - - for keep in ['first', 'last', False]: - assert df.duplicated(keep=keep).sum() == 0 - - def test_drop_duplicates_with_duplicate_column_names(self): - # GH17836 - df = DataFrame([ - [1, 2, 5], - [3, 4, 6], - [3, 4, 7] - ], columns=['a', 'a', 'b']) - - result0 = df.drop_duplicates() - tm.assert_frame_equal(result0, df) - - result1 = df.drop_duplicates('a') - expected1 = df[:2] - tm.assert_frame_equal(result1, expected1) - - def test_drop_duplicates_for_take_all(self): - df = DataFrame({'AAA': ['foo', 'bar', 'baz', 'bar', - 'foo', 'bar', 'qux', 'foo'], - 'B': ['one', 'one', 'two', 'two', - 'two', 'two', 'one', 'two'], - 'C': [1, 1, 2, 2, 2, 2, 1, 2], - 'D': lrange(8)}) - - # single column - result = df.drop_duplicates('AAA') - expected = df.iloc[[0, 1, 2, 6]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates('AAA', keep='last') - expected = df.iloc[[2, 5, 6, 7]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates('AAA', keep=False) - expected = df.iloc[[2, 6]] - tm.assert_frame_equal(result, expected) - - # multiple columns - result = df.drop_duplicates(['AAA', 'B']) - expected = df.iloc[[0, 1, 2, 3, 4, 6]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates(['AAA', 'B'], keep='last') - expected = df.iloc[[0, 1, 2, 5, 6, 7]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates(['AAA', 'B'], keep=False) - expected = df.iloc[[0, 1, 2, 6]] - tm.assert_frame_equal(result, expected) - - def test_drop_duplicates_tuple(self): - df = DataFrame({('AA', 'AB'): ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'bar', 'foo'], - 'B': ['one', 'one', 'two', 'two', - 'two', 'two', 'one', 'two'], - 'C': [1, 1, 2, 2, 2, 2, 1, 2], - 'D': lrange(8)}) - - # single column - result = df.drop_duplicates(('AA', 'AB')) - expected = df[:2] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates(('AA', 'AB'), keep='last') - expected = df.loc[[6, 7]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates(('AA', 'AB'), keep=False) - expected = df.loc[[]] # empty df - assert len(result) == 0 - tm.assert_frame_equal(result, expected) - - # multi column - expected = df.loc[[0, 1, 2, 3]] - result = df.drop_duplicates((('AA', 'AB'), 'B')) - tm.assert_frame_equal(result, expected) - - def test_drop_duplicates_NA(self): - # none - df = DataFrame({'A': [None, None, 'foo', 'bar', - 'foo', 'bar', 'bar', 'foo'], - 'B': ['one', 'one', 'two', 'two', - 'two', 'two', 'one', 'two'], - 'C': [1.0, np.nan, np.nan, np.nan, 1., 1., 1, 1.], - 'D': lrange(8)}) - - # single column - result = df.drop_duplicates('A') - expected = df.loc[[0, 2, 3]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates('A', keep='last') - expected = df.loc[[1, 6, 7]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates('A', keep=False) - expected = df.loc[[]] # empty df - tm.assert_frame_equal(result, expected) - assert len(result) == 0 - - # multi column - result = df.drop_duplicates(['A', 'B']) - expected = df.loc[[0, 2, 3, 6]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates(['A', 'B'], keep='last') - expected = df.loc[[1, 5, 6, 7]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates(['A', 'B'], keep=False) - expected = df.loc[[6]] - tm.assert_frame_equal(result, expected) - - # nan - df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'bar', 'foo'], - 'B': ['one', 'one', 'two', 'two', - 'two', 'two', 'one', 'two'], - 'C': [1.0, np.nan, np.nan, np.nan, 1., 1., 1, 1.], - 'D': lrange(8)}) - - # single column - result = df.drop_duplicates('C') - expected = df[:2] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates('C', keep='last') - expected = df.loc[[3, 7]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates('C', keep=False) - expected = df.loc[[]] # empty df - tm.assert_frame_equal(result, expected) - assert len(result) == 0 - - # multi column - result = df.drop_duplicates(['C', 'B']) - expected = df.loc[[0, 1, 2, 4]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates(['C', 'B'], keep='last') - expected = df.loc[[1, 3, 6, 7]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates(['C', 'B'], keep=False) - expected = df.loc[[1]] - tm.assert_frame_equal(result, expected) - - def test_drop_duplicates_NA_for_take_all(self): - # none - df = DataFrame({'A': [None, None, 'foo', 'bar', - 'foo', 'baz', 'bar', 'qux'], - 'C': [1.0, np.nan, np.nan, np.nan, 1., 2., 3, 1.]}) - - # single column - result = df.drop_duplicates('A') - expected = df.iloc[[0, 2, 3, 5, 7]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates('A', keep='last') - expected = df.iloc[[1, 4, 5, 6, 7]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates('A', keep=False) - expected = df.iloc[[5, 7]] - tm.assert_frame_equal(result, expected) - - # nan - - # single column - result = df.drop_duplicates('C') - expected = df.iloc[[0, 1, 5, 6]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates('C', keep='last') - expected = df.iloc[[3, 5, 6, 7]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates('C', keep=False) - expected = df.iloc[[5, 6]] - tm.assert_frame_equal(result, expected) - - def test_drop_duplicates_inplace(self): - orig = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'bar', 'foo'], - 'B': ['one', 'one', 'two', 'two', - 'two', 'two', 'one', 'two'], - 'C': [1, 1, 2, 2, 2, 2, 1, 2], - 'D': lrange(8)}) - - # single column - df = orig.copy() - df.drop_duplicates('A', inplace=True) - expected = orig[:2] - result = df - tm.assert_frame_equal(result, expected) - - df = orig.copy() - df.drop_duplicates('A', keep='last', inplace=True) - expected = orig.loc[[6, 7]] - result = df - tm.assert_frame_equal(result, expected) - - df = orig.copy() - df.drop_duplicates('A', keep=False, inplace=True) - expected = orig.loc[[]] - result = df - tm.assert_frame_equal(result, expected) - assert len(df) == 0 - - # multi column - df = orig.copy() - df.drop_duplicates(['A', 'B'], inplace=True) - expected = orig.loc[[0, 1, 2, 3]] - result = df - tm.assert_frame_equal(result, expected) - - df = orig.copy() - df.drop_duplicates(['A', 'B'], keep='last', inplace=True) - expected = orig.loc[[0, 5, 6, 7]] - result = df - tm.assert_frame_equal(result, expected) - - df = orig.copy() - df.drop_duplicates(['A', 'B'], keep=False, inplace=True) - expected = orig.loc[[0]] - result = df - tm.assert_frame_equal(result, expected) - - # consider everything - orig2 = orig.loc[:, ['A', 'B', 'C']].copy() - - df2 = orig2.copy() - df2.drop_duplicates(inplace=True) - # in this case only - expected = orig2.drop_duplicates(['A', 'B']) - result = df2 - tm.assert_frame_equal(result, expected) - - df2 = orig2.copy() - df2.drop_duplicates(keep='last', inplace=True) - expected = orig2.drop_duplicates(['A', 'B'], keep='last') - result = df2 - tm.assert_frame_equal(result, expected) - - df2 = orig2.copy() - df2.drop_duplicates(keep=False, inplace=True) - expected = orig2.drop_duplicates(['A', 'B'], keep=False) - result = df2 - tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize('subset', ['a', ['a'], ['a', 'B']]) +def test_duplicated_with_misspelled_column_name(subset): + # GH 19730 + df = pd.DataFrame({'A': [0, 0, 1], + 'B': [0, 0, 1], + 'C': [0, 0, 1]}) + + with pytest.raises(KeyError): + df.duplicated(subset) + + with pytest.raises(KeyError): + df.drop_duplicates(subset) + + +@pytest.mark.slow +def test_duplicated_do_not_fail_on_wide_dataframes(): + # gh-21524 + # Given the wide dataframe with a lot of columns + # with different (important!) values + data = {'col_{0:02d}'.format(i): np.random.randint(0, 1000, 30000) + for i in range(100)} + df = pd.DataFrame(data).T + result = df.duplicated() + + # Then duplicates produce the bool pd.Series as a result + # and don't fail during calculation. + # Actual values doesn't matter here, though usually + # it's all False in this case + assert isinstance(result, pd.Series) + assert result.dtype == np.bool + + +@pytest.mark.parametrize('keep, expected', [ + ('first', Series([False, False, True, False, True])), + ('last', Series([True, True, False, False, False])), + (False, Series([True, True, True, False, True])) +]) +def test_duplicated_keep(keep, expected): + df = DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']}) + + result = df.duplicated(keep=keep) + tm.assert_series_equal(result, expected) + + +@pytest.mark.xfail(reason="GH21720; nan/None falsely considered equal") +@pytest.mark.parametrize('keep, expected', [ + ('first', Series([False, False, True, False, True])), + ('last', Series([True, True, False, False, False])), + (False, Series([True, True, True, False, True])) +]) +def test_duplicated_nan_none(keep, expected): + df = DataFrame({'C': [np.nan, 3, 3, None, np.nan]}, dtype=object) + + result = df.duplicated(keep=keep) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize('keep', ['first', 'last', False]) +@pytest.mark.parametrize('subset', [None, ['A', 'B'], 'A']) +def test_duplicated_subset(subset, keep): + df = DataFrame({'A': [0, 1, 1, 2, 0], + 'B': ['a', 'b', 'b', 'c', 'a'], + 'C': [np.nan, 3, 3, None, np.nan]}) + + if subset is None: + subset = list(df.columns) + elif isinstance(subset, string_types): + # need to have a DataFrame, not a Series + # -> select columns with singleton list, not string + subset = [subset] + + expected = df[subset].duplicated(keep=keep) + result = df.duplicated(keep=keep, subset=subset) + tm.assert_series_equal(result, expected) + + +def test_drop_duplicates(): + df = DataFrame({'AAA': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'bar', 'foo'], + 'B': ['one', 'one', 'two', 'two', + 'two', 'two', 'one', 'two'], + 'C': [1, 1, 2, 2, 2, 2, 1, 2], + 'D': lrange(8)}) + + # single column + result = df.drop_duplicates('AAA') + expected = df[:2] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates('AAA', keep='last') + expected = df.loc[[6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates('AAA', keep=False) + expected = df.loc[[]] + tm.assert_frame_equal(result, expected) + assert len(result) == 0 + + # multi column + expected = df.loc[[0, 1, 2, 3]] + result = df.drop_duplicates(np.array(['AAA', 'B'])) + tm.assert_frame_equal(result, expected) + result = df.drop_duplicates(['AAA', 'B']) + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(('AAA', 'B'), keep='last') + expected = df.loc[[0, 5, 6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(('AAA', 'B'), keep=False) + expected = df.loc[[0]] + tm.assert_frame_equal(result, expected) + + # consider everything + df2 = df.loc[:, ['AAA', 'B', 'C']] + + result = df2.drop_duplicates() + # in this case only + expected = df2.drop_duplicates(['AAA', 'B']) + tm.assert_frame_equal(result, expected) + + result = df2.drop_duplicates(keep='last') + expected = df2.drop_duplicates(['AAA', 'B'], keep='last') + tm.assert_frame_equal(result, expected) + + result = df2.drop_duplicates(keep=False) + expected = df2.drop_duplicates(['AAA', 'B'], keep=False) + tm.assert_frame_equal(result, expected) + + # integers + result = df.drop_duplicates('C') + expected = df.iloc[[0, 2]] + tm.assert_frame_equal(result, expected) + result = df.drop_duplicates('C', keep='last') + expected = df.iloc[[-2, -1]] + tm.assert_frame_equal(result, expected) + + df['E'] = df['C'].astype('int8') + result = df.drop_duplicates('E') + expected = df.iloc[[0, 2]] + tm.assert_frame_equal(result, expected) + result = df.drop_duplicates('E', keep='last') + expected = df.iloc[[-2, -1]] + tm.assert_frame_equal(result, expected) + + # GH 11376 + df = pd.DataFrame({'x': [7, 6, 3, 3, 4, 8, 0], + 'y': [0, 6, 5, 5, 9, 1, 2]}) + expected = df.loc[df.index != 3] + tm.assert_frame_equal(df.drop_duplicates(), expected) + + df = pd.DataFrame([[1, 0], [0, 2]]) + tm.assert_frame_equal(df.drop_duplicates(), df) + + df = pd.DataFrame([[-2, 0], [0, -4]]) + tm.assert_frame_equal(df.drop_duplicates(), df) + + x = np.iinfo(np.int64).max / 3 * 2 + df = pd.DataFrame([[-x, x], [0, x + 4]]) + tm.assert_frame_equal(df.drop_duplicates(), df) + + df = pd.DataFrame([[-x, x], [x, x + 4]]) + tm.assert_frame_equal(df.drop_duplicates(), df) + + # GH 11864 + df = pd.DataFrame([i] * 9 for i in range(16)) + df = df.append([[1] + [0] * 8], ignore_index=True) + + for keep in ['first', 'last', False]: + assert df.duplicated(keep=keep).sum() == 0 + + +def test_drop_duplicates_with_duplicate_column_names(): + # GH17836 + df = DataFrame([ + [1, 2, 5], + [3, 4, 6], + [3, 4, 7] + ], columns=['a', 'a', 'b']) + + result0 = df.drop_duplicates() + tm.assert_frame_equal(result0, df) + + result1 = df.drop_duplicates('a') + expected1 = df[:2] + tm.assert_frame_equal(result1, expected1) + + +def test_drop_duplicates_for_take_all(): + df = DataFrame({'AAA': ['foo', 'bar', 'baz', 'bar', + 'foo', 'bar', 'qux', 'foo'], + 'B': ['one', 'one', 'two', 'two', + 'two', 'two', 'one', 'two'], + 'C': [1, 1, 2, 2, 2, 2, 1, 2], + 'D': lrange(8)}) + + # single column + result = df.drop_duplicates('AAA') + expected = df.iloc[[0, 1, 2, 6]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates('AAA', keep='last') + expected = df.iloc[[2, 5, 6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates('AAA', keep=False) + expected = df.iloc[[2, 6]] + tm.assert_frame_equal(result, expected) + + # multiple columns + result = df.drop_duplicates(['AAA', 'B']) + expected = df.iloc[[0, 1, 2, 3, 4, 6]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(['AAA', 'B'], keep='last') + expected = df.iloc[[0, 1, 2, 5, 6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(['AAA', 'B'], keep=False) + expected = df.iloc[[0, 1, 2, 6]] + tm.assert_frame_equal(result, expected) + + +def test_drop_duplicates_tuple(): + df = DataFrame({('AA', 'AB'): ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'bar', 'foo'], + 'B': ['one', 'one', 'two', 'two', + 'two', 'two', 'one', 'two'], + 'C': [1, 1, 2, 2, 2, 2, 1, 2], + 'D': lrange(8)}) + + # single column + result = df.drop_duplicates(('AA', 'AB')) + expected = df[:2] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(('AA', 'AB'), keep='last') + expected = df.loc[[6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(('AA', 'AB'), keep=False) + expected = df.loc[[]] # empty df + assert len(result) == 0 + tm.assert_frame_equal(result, expected) + + # multi column + expected = df.loc[[0, 1, 2, 3]] + result = df.drop_duplicates((('AA', 'AB'), 'B')) + tm.assert_frame_equal(result, expected) + + +def test_drop_duplicates_NA(): + # none + df = DataFrame({'A': [None, None, 'foo', 'bar', + 'foo', 'bar', 'bar', 'foo'], + 'B': ['one', 'one', 'two', 'two', + 'two', 'two', 'one', 'two'], + 'C': [1.0, np.nan, np.nan, np.nan, 1., 1., 1, 1.], + 'D': lrange(8)}) + + # single column + result = df.drop_duplicates('A') + expected = df.loc[[0, 2, 3]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates('A', keep='last') + expected = df.loc[[1, 6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates('A', keep=False) + expected = df.loc[[]] # empty df + tm.assert_frame_equal(result, expected) + assert len(result) == 0 + + # multi column + result = df.drop_duplicates(['A', 'B']) + expected = df.loc[[0, 2, 3, 6]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(['A', 'B'], keep='last') + expected = df.loc[[1, 5, 6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(['A', 'B'], keep=False) + expected = df.loc[[6]] + tm.assert_frame_equal(result, expected) + + # nan + df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'bar', 'foo'], + 'B': ['one', 'one', 'two', 'two', + 'two', 'two', 'one', 'two'], + 'C': [1.0, np.nan, np.nan, np.nan, 1., 1., 1, 1.], + 'D': lrange(8)}) + + # single column + result = df.drop_duplicates('C') + expected = df[:2] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates('C', keep='last') + expected = df.loc[[3, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates('C', keep=False) + expected = df.loc[[]] # empty df + tm.assert_frame_equal(result, expected) + assert len(result) == 0 + + # multi column + result = df.drop_duplicates(['C', 'B']) + expected = df.loc[[0, 1, 2, 4]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(['C', 'B'], keep='last') + expected = df.loc[[1, 3, 6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(['C', 'B'], keep=False) + expected = df.loc[[1]] + tm.assert_frame_equal(result, expected) + + +def test_drop_duplicates_NA_for_take_all(): + # none + df = DataFrame({'A': [None, None, 'foo', 'bar', + 'foo', 'baz', 'bar', 'qux'], + 'C': [1.0, np.nan, np.nan, np.nan, 1., 2., 3, 1.]}) + + # single column + result = df.drop_duplicates('A') + expected = df.iloc[[0, 2, 3, 5, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates('A', keep='last') + expected = df.iloc[[1, 4, 5, 6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates('A', keep=False) + expected = df.iloc[[5, 7]] + tm.assert_frame_equal(result, expected) + + # nan + + # single column + result = df.drop_duplicates('C') + expected = df.iloc[[0, 1, 5, 6]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates('C', keep='last') + expected = df.iloc[[3, 5, 6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates('C', keep=False) + expected = df.iloc[[5, 6]] + tm.assert_frame_equal(result, expected) + + +def test_drop_duplicates_inplace(): + orig = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'bar', 'foo'], + 'B': ['one', 'one', 'two', 'two', + 'two', 'two', 'one', 'two'], + 'C': [1, 1, 2, 2, 2, 2, 1, 2], + 'D': lrange(8)}) + + # single column + df = orig.copy() + df.drop_duplicates('A', inplace=True) + expected = orig[:2] + result = df + tm.assert_frame_equal(result, expected) + + df = orig.copy() + df.drop_duplicates('A', keep='last', inplace=True) + expected = orig.loc[[6, 7]] + result = df + tm.assert_frame_equal(result, expected) + + df = orig.copy() + df.drop_duplicates('A', keep=False, inplace=True) + expected = orig.loc[[]] + result = df + tm.assert_frame_equal(result, expected) + assert len(df) == 0 + + # multi column + df = orig.copy() + df.drop_duplicates(['A', 'B'], inplace=True) + expected = orig.loc[[0, 1, 2, 3]] + result = df + tm.assert_frame_equal(result, expected) + + df = orig.copy() + df.drop_duplicates(['A', 'B'], keep='last', inplace=True) + expected = orig.loc[[0, 5, 6, 7]] + result = df + tm.assert_frame_equal(result, expected) + + df = orig.copy() + df.drop_duplicates(['A', 'B'], keep=False, inplace=True) + expected = orig.loc[[0]] + result = df + tm.assert_frame_equal(result, expected) + + # consider everything + orig2 = orig.loc[:, ['A', 'B', 'C']].copy() + + df2 = orig2.copy() + df2.drop_duplicates(inplace=True) + # in this case only + expected = orig2.drop_duplicates(['A', 'B']) + result = df2 + tm.assert_frame_equal(result, expected) + + df2 = orig2.copy() + df2.drop_duplicates(keep='last', inplace=True) + expected = orig2.drop_duplicates(['A', 'B'], keep='last') + result = df2 + tm.assert_frame_equal(result, expected) + + df2 = orig2.copy() + df2.drop_duplicates(keep=False, inplace=True) + expected = orig2.drop_duplicates(['A', 'B'], keep=False) + result = df2 + tm.assert_frame_equal(result, expected) From 9d428e25274854e77a85a73e6fbbaadcd90dd052 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sat, 14 Jul 2018 22:34:21 +0200 Subject: [PATCH 4/4] Further cleanup --- pandas/tests/frame/test_duplicates.py | 32 +++++++++++++-------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/pandas/tests/frame/test_duplicates.py b/pandas/tests/frame/test_duplicates.py index 1421c03c0e343..289170527dea7 100644 --- a/pandas/tests/frame/test_duplicates.py +++ b/pandas/tests/frame/test_duplicates.py @@ -4,7 +4,6 @@ import pytest -import pandas as pd import numpy as np from pandas.compat import lrange, string_types @@ -16,9 +15,9 @@ @pytest.mark.parametrize('subset', ['a', ['a'], ['a', 'B']]) def test_duplicated_with_misspelled_column_name(subset): # GH 19730 - df = pd.DataFrame({'A': [0, 0, 1], - 'B': [0, 0, 1], - 'C': [0, 0, 1]}) + df = DataFrame({'A': [0, 0, 1], + 'B': [0, 0, 1], + 'C': [0, 0, 1]}) with pytest.raises(KeyError): df.duplicated(subset) @@ -34,14 +33,13 @@ def test_duplicated_do_not_fail_on_wide_dataframes(): # with different (important!) values data = {'col_{0:02d}'.format(i): np.random.randint(0, 1000, 30000) for i in range(100)} - df = pd.DataFrame(data).T + df = DataFrame(data).T result = df.duplicated() - # Then duplicates produce the bool pd.Series as a result - # and don't fail during calculation. - # Actual values doesn't matter here, though usually - # it's all False in this case - assert isinstance(result, pd.Series) + # Then duplicates produce the bool Series as a result and don't fail during + # calculation. Actual values doesn't matter here, though usually it's all + # False in this case + assert isinstance(result, Series) assert result.dtype == np.bool @@ -159,26 +157,26 @@ def test_drop_duplicates(): tm.assert_frame_equal(result, expected) # GH 11376 - df = pd.DataFrame({'x': [7, 6, 3, 3, 4, 8, 0], - 'y': [0, 6, 5, 5, 9, 1, 2]}) + df = DataFrame({'x': [7, 6, 3, 3, 4, 8, 0], + 'y': [0, 6, 5, 5, 9, 1, 2]}) expected = df.loc[df.index != 3] tm.assert_frame_equal(df.drop_duplicates(), expected) - df = pd.DataFrame([[1, 0], [0, 2]]) + df = DataFrame([[1, 0], [0, 2]]) tm.assert_frame_equal(df.drop_duplicates(), df) - df = pd.DataFrame([[-2, 0], [0, -4]]) + df = DataFrame([[-2, 0], [0, -4]]) tm.assert_frame_equal(df.drop_duplicates(), df) x = np.iinfo(np.int64).max / 3 * 2 - df = pd.DataFrame([[-x, x], [0, x + 4]]) + df = DataFrame([[-x, x], [0, x + 4]]) tm.assert_frame_equal(df.drop_duplicates(), df) - df = pd.DataFrame([[-x, x], [x, x + 4]]) + df = DataFrame([[-x, x], [x, x + 4]]) tm.assert_frame_equal(df.drop_duplicates(), df) # GH 11864 - df = pd.DataFrame([i] * 9 for i in range(16)) + df = DataFrame([i] * 9 for i in range(16)) df = df.append([[1] + [0] * 8], ignore_index=True) for keep in ['first', 'last', False]: