Commit 27a2bb7

TST: add test for duplicated frame/test_analytics
1 parent da6e26d

1 file changed (+72, -31 lines)


pandas/tests/frame/test_analytics.py (+72, -31)
@@ -12,7 +12,7 @@
 from numpy.random import randn
 import numpy as np
 
-from pandas.compat import lrange, PY35
+from pandas.compat import lrange, PY35, string_types
 from pandas import (compat, isna, notna, DataFrame, Series,
                     MultiIndex, date_range, Timestamp, Categorical,
                     _np_version_under1p12,
@@ -1545,6 +1545,77 @@ def test_isin_empty_datetimelike(self):
     # ----------------------------------------------------------------------
     # Row deduplication
 
+    @pytest.mark.parametrize('subset', ['a', ['a'], ['a', 'B']])
+    def test_duplicated_with_misspelled_column_name(self, subset):
+        # GH 19730
+        df = pd.DataFrame({'A': [0, 0, 1],
+                           'B': [0, 0, 1],
+                           'C': [0, 0, 1]})
+
+        with pytest.raises(KeyError):
+            df.duplicated(subset)
+
+        with pytest.raises(KeyError):
+            df.drop_duplicates(subset)
+
+    @pytest.mark.slow
+    def test_duplicated_do_not_fail_on_wide_dataframes(self):
+        # gh-21524
+        # Given the wide dataframe with a lot of columns
+        # with different (important!) values
+        data = {'col_{0:02d}'.format(i): np.random.randint(0, 1000, 30000)
+                for i in range(100)}
+        df = pd.DataFrame(data).T
+        result = df.duplicated()
+
+        # Then duplicates produce the bool pd.Series as a result
+        # and don't fail during calculation.
+        # Actual values doesn't matter here, though usually
+        # it's all False in this case
+        assert isinstance(result, pd.Series)
+        assert result.dtype == np.bool
+
+    @pytest.mark.parametrize('keep, expected', [
+        ('first', Series([False, False, True, False, True])),
+        ('last', Series([True, True, False, False, False])),
+        (False, Series([True, True, True, False, True]))
+    ])
+    def test_duplicated_keep(self, keep, expected):
+        df = DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']})
+
+        result = df.duplicated(keep=keep)
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.xfail(reason="GH21720; nan/None falsely considered equal")
+    @pytest.mark.parametrize('keep, expected', [
+        ('first', Series([False, False, True, False, True])),
+        ('last', Series([True, True, False, False, False])),
+        (False, Series([True, True, True, False, True]))
+    ])
+    def test_duplicated_nan_none(self, keep, expected):
+        df = DataFrame({'C': [np.nan, 3, 3, None, np.nan]}, dtype=object)
+
+        result = df.duplicated(keep=keep)
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize('keep', ['first', 'last', False])
+    @pytest.mark.parametrize('subset', [None, ['A', 'B'], 'A'])
+    def test_duplicated_subset(self, subset, keep):
+        df = DataFrame({'A': [0, 1, 1, 2, 0],
+                        'B': ['a', 'b', 'b', 'c', 'a'],
+                        'C': [np.nan, 3, 3, None, np.nan]})
+
+        if subset is None:
+            subset = list(df.columns)
+        elif isinstance(subset, string_types):
+            # need to have a DataFrame, not a Series
+            # -> select columns with singleton list, not string
+            subset = [subset]
+
+        expected = df[subset].duplicated(keep=keep)
+        result = df.duplicated(keep=keep, subset=subset)
+        tm.assert_series_equal(result, expected)
+
     def test_drop_duplicates(self):
         df = DataFrame({'AAA': ['foo', 'bar', 'foo', 'bar',
                                 'foo', 'bar', 'bar', 'foo'],
@@ -1640,36 +1711,6 @@ def test_drop_duplicates(self):
         for keep in ['first', 'last', False]:
            assert df.duplicated(keep=keep).sum() == 0
 
-    @pytest.mark.parametrize('subset', ['a', ['a'], ['a', 'B']])
-    def test_duplicated_with_misspelled_column_name(self, subset):
-        # GH 19730
-        df = pd.DataFrame({'A': [0, 0, 1],
-                           'B': [0, 0, 1],
-                           'C': [0, 0, 1]})
-
-        with pytest.raises(KeyError):
-            df.duplicated(subset)
-
-        with pytest.raises(KeyError):
-            df.drop_duplicates(subset)
-
-    @pytest.mark.slow
-    def test_duplicated_do_not_fail_on_wide_dataframes(self):
-        # gh-21524
-        # Given the wide dataframe with a lot of columns
-        # with different (important!) values
-        data = {'col_{0:02d}'.format(i): np.random.randint(0, 1000, 30000)
-                for i in range(100)}
-        df = pd.DataFrame(data).T
-        result = df.duplicated()
-
-        # Then duplicates produce the bool pd.Series as a result
-        # and don't fail during calculation.
-        # Actual values doesn't matter here, though usually
-        # it's all False in this case
-        assert isinstance(result, pd.Series)
-        assert result.dtype == np.bool
-
     def test_drop_duplicates_with_duplicate_column_names(self):
         # GH17836
         df = DataFrame([

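As a reading aid rather than part of the committed diff, here is a minimal sketch of the keep/subset semantics the new parametrized tests assert, run against the same toy frame used in test_duplicated_keep; the printed values mirror the expected Series in the parametrize lists above.

    import pandas as pd

    # Toy frame from test_duplicated_keep: rows 0/4 and rows 1/2 are duplicates.
    df = pd.DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']})

    # keep='first' flags every occurrence after the first one.
    print(df.duplicated(keep='first').tolist())  # [False, False, True, False, True]

    # keep='last' flags every occurrence before the last one.
    print(df.duplicated(keep='last').tolist())   # [True, True, False, False, False]

    # keep=False flags every member of any duplicated group.
    print(df.duplicated(keep=False).tolist())    # [True, True, True, False, True]

    # Restricting to a subset of columns is equivalent to deduplicating the
    # projection, which is what test_duplicated_subset cross-checks.
    assert df.duplicated(subset=['A']).equals(df[['A']].duplicated())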