Skip to content

Commit 04b6bce

Browse files
h-vetinari authored and aeltanawy committed
TST: add test for duplicated frame/test_analytics (pandas-dev#21898)
1 parent d7fdeb1 commit 04b6bce

File tree

2 files changed

+439
-378
lines changed

2 files changed

+439
-378
lines changed

pandas/tests/frame/test_analytics.py

-378
Original file line number | Diff line number | Diff line change
@@ -1542,384 +1542,6 @@ def test_isin_empty_datetimelike(self):
15421542
result = df1_td.isin(df3)
15431543
tm.assert_frame_equal(result, expected)
15441544

1545-
# ----------------------------------------------------------------------
1546-
# Row deduplication
1547-
1548-
def test_drop_duplicates(self):
1549-
df = DataFrame({'AAA': ['foo', 'bar', 'foo', 'bar',
1550-
'foo', 'bar', 'bar', 'foo'],
1551-
'B': ['one', 'one', 'two', 'two',
1552-
'two', 'two', 'one', 'two'],
1553-
'C': [1, 1, 2, 2, 2, 2, 1, 2],
1554-
'D': lrange(8)})
1555-
1556-
# single column
1557-
result = df.drop_duplicates('AAA')
1558-
expected = df[:2]
1559-
tm.assert_frame_equal(result, expected)
1560-
1561-
result = df.drop_duplicates('AAA', keep='last')
1562-
expected = df.loc[[6, 7]]
1563-
tm.assert_frame_equal(result, expected)
1564-
1565-
result = df.drop_duplicates('AAA', keep=False)
1566-
expected = df.loc[[]]
1567-
tm.assert_frame_equal(result, expected)
1568-
assert len(result) == 0
1569-
1570-
# multi column
1571-
expected = df.loc[[0, 1, 2, 3]]
1572-
result = df.drop_duplicates(np.array(['AAA', 'B']))
1573-
tm.assert_frame_equal(result, expected)
1574-
result = df.drop_duplicates(['AAA', 'B'])
1575-
tm.assert_frame_equal(result, expected)
1576-
1577-
result = df.drop_duplicates(('AAA', 'B'), keep='last')
1578-
expected = df.loc[[0, 5, 6, 7]]
1579-
tm.assert_frame_equal(result, expected)
1580-
1581-
result = df.drop_duplicates(('AAA', 'B'), keep=False)
1582-
expected = df.loc[[0]]
1583-
tm.assert_frame_equal(result, expected)
1584-
1585-
# consider everything
1586-
df2 = df.loc[:, ['AAA', 'B', 'C']]
1587-
1588-
result = df2.drop_duplicates()
1589-
# in this case only
1590-
expected = df2.drop_duplicates(['AAA', 'B'])
1591-
tm.assert_frame_equal(result, expected)
1592-
1593-
result = df2.drop_duplicates(keep='last')
1594-
expected = df2.drop_duplicates(['AAA', 'B'], keep='last')
1595-
tm.assert_frame_equal(result, expected)
1596-
1597-
result = df2.drop_duplicates(keep=False)
1598-
expected = df2.drop_duplicates(['AAA', 'B'], keep=False)
1599-
tm.assert_frame_equal(result, expected)
1600-
1601-
# integers
1602-
result = df.drop_duplicates('C')
1603-
expected = df.iloc[[0, 2]]
1604-
tm.assert_frame_equal(result, expected)
1605-
result = df.drop_duplicates('C', keep='last')
1606-
expected = df.iloc[[-2, -1]]
1607-
tm.assert_frame_equal(result, expected)
1608-
1609-
df['E'] = df['C'].astype('int8')
1610-
result = df.drop_duplicates('E')
1611-
expected = df.iloc[[0, 2]]
1612-
tm.assert_frame_equal(result, expected)
1613-
result = df.drop_duplicates('E', keep='last')
1614-
expected = df.iloc[[-2, -1]]
1615-
tm.assert_frame_equal(result, expected)
1616-
1617-
# GH 11376
1618-
df = pd.DataFrame({'x': [7, 6, 3, 3, 4, 8, 0],
1619-
'y': [0, 6, 5, 5, 9, 1, 2]})
1620-
expected = df.loc[df.index != 3]
1621-
tm.assert_frame_equal(df.drop_duplicates(), expected)
1622-
1623-
df = pd.DataFrame([[1, 0], [0, 2]])
1624-
tm.assert_frame_equal(df.drop_duplicates(), df)
1625-
1626-
df = pd.DataFrame([[-2, 0], [0, -4]])
1627-
tm.assert_frame_equal(df.drop_duplicates(), df)
1628-
1629-
x = np.iinfo(np.int64).max / 3 * 2
1630-
df = pd.DataFrame([[-x, x], [0, x + 4]])
1631-
tm.assert_frame_equal(df.drop_duplicates(), df)
1632-
1633-
df = pd.DataFrame([[-x, x], [x, x + 4]])
1634-
tm.assert_frame_equal(df.drop_duplicates(), df)
1635-
1636-
# GH 11864
1637-
df = pd.DataFrame([i] * 9 for i in range(16))
1638-
df = df.append([[1] + [0] * 8], ignore_index=True)
1639-
1640-
for keep in ['first', 'last', False]:
1641-
assert df.duplicated(keep=keep).sum() == 0
1642-
1643-
@pytest.mark.parametrize('subset', ['a', ['a'], ['a', 'B']])
def test_duplicated_with_misspelled_column_name(self, subset):
    """GH 19730: a ``subset`` naming a missing column raises ``KeyError``.

    The frame only has the upper-case columns 'A', 'B' and 'C', while every
    parametrized ``subset`` contains the lower-case label 'a'.
    """
    columns = {'A': [0, 0, 1],
               'B': [0, 0, 1],
               'C': [0, 0, 1]}
    frame = pd.DataFrame(columns)

    with pytest.raises(KeyError):
        frame.duplicated(subset)

    with pytest.raises(KeyError):
        frame.drop_duplicates(subset)
1655-
1656-
@pytest.mark.slow
def test_duplicated_do_not_fail_on_wide_dataframes(self):
    """gh-21524: ``duplicated`` completes on a frame with very many columns.

    One hundred random integer arrays of length 30000 are assembled into a
    frame and transposed, giving 100 rows by 30000 columns.  Only the type
    and dtype of the result are checked.
    """
    # Given the wide dataframe with a lot of columns
    # with different (important!) values
    data = {'col_{0:02d}'.format(i): np.random.randint(0, 1000, 30000)
            for i in range(100)}
    df = pd.DataFrame(data).T
    result = df.duplicated()

    # Then duplicates produce the bool pd.Series as a result
    # and don't fail during calculation.
    # Actual values don't matter here, though usually
    # it's all False in this case
    assert isinstance(result, pd.Series)
    # Compare against np.bool_: the bare ``np.bool`` alias of the builtin
    # was deprecated by NumPy and is missing from some releases.
    assert result.dtype == np.bool_
1672-
1673-
def test_drop_duplicates_with_duplicate_column_names(self):
1674-
# GH17836
1675-
df = DataFrame([
1676-
[1, 2, 5],
1677-
[3, 4, 6],
1678-
[3, 4, 7]
1679-
], columns=['a', 'a', 'b'])
1680-
1681-
result0 = df.drop_duplicates()
1682-
tm.assert_frame_equal(result0, df)
1683-
1684-
result1 = df.drop_duplicates('a')
1685-
expected1 = df[:2]
1686-
tm.assert_frame_equal(result1, expected1)
1687-
1688-
def test_drop_duplicates_for_take_all(self):
    """Compare ``drop_duplicates`` results with rows picked through ``iloc``.

    Each case is ``(subset, extra keyword arguments, expected positions)``;
    an empty keyword dict means ``keep`` is left at its default.
    """
    frame = DataFrame({'AAA': ['foo', 'bar', 'baz', 'bar',
                               'foo', 'bar', 'qux', 'foo'],
                       'B': ['one', 'one', 'two', 'two',
                             'two', 'two', 'one', 'two'],
                       'C': [1, 1, 2, 2, 2, 2, 1, 2],
                       'D': lrange(8)})

    cases = [
        # single column
        ('AAA', {}, [0, 1, 2, 6]),
        ('AAA', {'keep': 'last'}, [2, 5, 6, 7]),
        ('AAA', {'keep': False}, [2, 6]),
        # multiple columns
        (['AAA', 'B'], {}, [0, 1, 2, 3, 4, 6]),
        (['AAA', 'B'], {'keep': 'last'}, [0, 1, 2, 5, 6, 7]),
        (['AAA', 'B'], {'keep': False}, [0, 1, 2, 6]),
    ]

    for subset, kwargs, positions in cases:
        result = frame.drop_duplicates(subset, **kwargs)
        expected = frame.iloc[positions]
        tm.assert_frame_equal(result, expected)
1721-
1722-
def test_drop_duplicates_tuple(self):
    """Run ``drop_duplicates`` on a frame where one column label is a tuple."""
    tuple_label = ('AA', 'AB')
    frame = DataFrame({tuple_label: ['foo', 'bar', 'foo', 'bar',
                                     'foo', 'bar', 'bar', 'foo'],
                       'B': ['one', 'one', 'two', 'two',
                             'two', 'two', 'one', 'two'],
                       'C': [1, 1, 2, 2, 2, 2, 1, 2],
                       'D': lrange(8)})

    # single column
    deduped = frame.drop_duplicates(('AA', 'AB'))
    tm.assert_frame_equal(deduped, frame[:2])

    deduped = frame.drop_duplicates(('AA', 'AB'), keep='last')
    tm.assert_frame_equal(deduped, frame.loc[[6, 7]])

    deduped = frame.drop_duplicates(('AA', 'AB'), keep=False)
    empty = frame.loc[[]]  # empty df
    assert len(deduped) == 0
    tm.assert_frame_equal(deduped, empty)

    # multi column
    first_four = frame.loc[[0, 1, 2, 3]]
    deduped = frame.drop_duplicates((('AA', 'AB'), 'B'))
    tm.assert_frame_equal(deduped, first_four)
1748-
1749-
def test_drop_duplicates_NA(self):
    """Check ``drop_duplicates`` when the subset column holds ``None`` or NaN.

    The first frame has ``None`` entries in string column 'A'; the second
    has NaN entries in float column 'C'.  Both are tested with single- and
    multi-column subsets and every ``keep`` option.
    """
    # none
    with_none = DataFrame({'A': [None, None, 'foo', 'bar',
                                 'foo', 'bar', 'bar', 'foo'],
                           'B': ['one', 'one', 'two', 'two',
                                 'two', 'two', 'one', 'two'],
                           'C': [1.0, np.nan, np.nan, np.nan, 1., 1., 1, 1.],
                           'D': lrange(8)})

    # single column
    deduped = with_none.drop_duplicates('A')
    tm.assert_frame_equal(deduped, with_none.loc[[0, 2, 3]])

    deduped = with_none.drop_duplicates('A', keep='last')
    tm.assert_frame_equal(deduped, with_none.loc[[1, 6, 7]])

    deduped = with_none.drop_duplicates('A', keep=False)
    tm.assert_frame_equal(deduped, with_none.loc[[]])  # empty df
    assert len(deduped) == 0

    # multi column
    deduped = with_none.drop_duplicates(['A', 'B'])
    tm.assert_frame_equal(deduped, with_none.loc[[0, 2, 3, 6]])

    deduped = with_none.drop_duplicates(['A', 'B'], keep='last')
    tm.assert_frame_equal(deduped, with_none.loc[[1, 5, 6, 7]])

    deduped = with_none.drop_duplicates(['A', 'B'], keep=False)
    tm.assert_frame_equal(deduped, with_none.loc[[6]])

    # nan
    with_nan = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                                'foo', 'bar', 'bar', 'foo'],
                          'B': ['one', 'one', 'two', 'two',
                                'two', 'two', 'one', 'two'],
                          'C': [1.0, np.nan, np.nan, np.nan, 1., 1., 1, 1.],
                          'D': lrange(8)})

    # single column
    deduped = with_nan.drop_duplicates('C')
    tm.assert_frame_equal(deduped, with_nan[:2])

    deduped = with_nan.drop_duplicates('C', keep='last')
    tm.assert_frame_equal(deduped, with_nan.loc[[3, 7]])

    deduped = with_nan.drop_duplicates('C', keep=False)
    tm.assert_frame_equal(deduped, with_nan.loc[[]])  # empty df
    assert len(deduped) == 0

    # multi column
    deduped = with_nan.drop_duplicates(['C', 'B'])
    tm.assert_frame_equal(deduped, with_nan.loc[[0, 1, 2, 4]])

    deduped = with_nan.drop_duplicates(['C', 'B'], keep='last')
    tm.assert_frame_equal(deduped, with_nan.loc[[1, 3, 6, 7]])

    deduped = with_nan.drop_duplicates(['C', 'B'], keep=False)
    tm.assert_frame_equal(deduped, with_nan.loc[[1]])
1819-
1820-
def test_drop_duplicates_NA_for_take_all(self):
1821-
# none
1822-
df = DataFrame({'A': [None, None, 'foo', 'bar',
1823-
'foo', 'baz', 'bar', 'qux'],
1824-
'C': [1.0, np.nan, np.nan, np.nan, 1., 2., 3, 1.]})
1825-
1826-
# single column
1827-
result = df.drop_duplicates('A')
1828-
expected = df.iloc[[0, 2, 3, 5, 7]]
1829-
tm.assert_frame_equal(result, expected)
1830-
1831-
result = df.drop_duplicates('A', keep='last')
1832-
expected = df.iloc[[1, 4, 5, 6, 7]]
1833-
tm.assert_frame_equal(result, expected)
1834-
1835-
result = df.drop_duplicates('A', keep=False)
1836-
expected = df.iloc[[5, 7]]
1837-
tm.assert_frame_equal(result, expected)
1838-
1839-
# nan
1840-
1841-
# single column
1842-
result = df.drop_duplicates('C')
1843-
expected = df.iloc[[0, 1, 5, 6]]
1844-
tm.assert_frame_equal(result, expected)
1845-
1846-
result = df.drop_duplicates('C', keep='last')
1847-
expected = df.iloc[[3, 5, 6, 7]]
1848-
tm.assert_frame_equal(result, expected)
1849-
1850-
result = df.drop_duplicates('C', keep=False)
1851-
expected = df.iloc[[5, 6]]
1852-
tm.assert_frame_equal(result, expected)
1853-
1854-
def test_drop_duplicates_inplace(self):
    """Check ``drop_duplicates(..., inplace=True)`` against fresh copies.

    Every scenario starts from a new copy of the reference frame, drops
    duplicates in place, and compares the mutated copy with rows selected
    from the untouched reference.
    """
    orig = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                            'foo', 'bar', 'bar', 'foo'],
                      'B': ['one', 'one', 'two', 'two',
                            'two', 'two', 'one', 'two'],
                      'C': [1, 1, 2, 2, 2, 2, 1, 2],
                      'D': lrange(8)})

    # single column
    work = orig.copy()
    work.drop_duplicates('A', inplace=True)
    tm.assert_frame_equal(work, orig[:2])

    work = orig.copy()
    work.drop_duplicates('A', keep='last', inplace=True)
    tm.assert_frame_equal(work, orig.loc[[6, 7]])

    work = orig.copy()
    work.drop_duplicates('A', keep=False, inplace=True)
    tm.assert_frame_equal(work, orig.loc[[]])
    assert len(work) == 0

    # multi column
    work = orig.copy()
    work.drop_duplicates(['A', 'B'], inplace=True)
    tm.assert_frame_equal(work, orig.loc[[0, 1, 2, 3]])

    work = orig.copy()
    work.drop_duplicates(['A', 'B'], keep='last', inplace=True)
    tm.assert_frame_equal(work, orig.loc[[0, 5, 6, 7]])

    work = orig.copy()
    work.drop_duplicates(['A', 'B'], keep=False, inplace=True)
    tm.assert_frame_equal(work, orig.loc[[0]])

    # consider everything
    orig2 = orig.loc[:, ['A', 'B', 'C']].copy()

    # in this case only: deduplicating on all columns is expected to match
    # deduplicating on ['A', 'B']
    for kwargs in ({}, {'keep': 'last'}, {'keep': False}):
        work2 = orig2.copy()
        work2.drop_duplicates(inplace=True, **kwargs)
        expected = orig2.drop_duplicates(['A', 'B'], **kwargs)
        tm.assert_frame_equal(work2, expected)
1922-
19231545
# Rounding
19241546
def test_round(self):
19251547
# GH 2665

0 commit comments

Comments
 (0)