|
12 | 12 | from numpy.random import randn
|
13 | 13 | import numpy as np
|
14 | 14 |
|
15 |
| -from pandas.compat import lrange, PY35 |
| 15 | +from pandas.compat import lrange, PY35, string_types |
16 | 16 | from pandas import (compat, isna, notna, DataFrame, Series,
|
17 | 17 | MultiIndex, date_range, Timestamp, Categorical,
|
18 | 18 | _np_version_under1p12,
|
@@ -1545,6 +1545,77 @@ def test_isin_empty_datetimelike(self):
|
1545 | 1545 | # ----------------------------------------------------------------------
|
1546 | 1546 | # Row deduplication
|
1547 | 1547 |
|
@pytest.mark.parametrize('subset', ['a', ['a'], ['a', 'B']])
def test_duplicated_with_misspelled_column_name(self, subset):
    # GH 19730
    # Every parametrized subset contains the lower-case label 'a',
    # which is not a column of the frame built below ('A', 'B', 'C').
    frame = pd.DataFrame({label: [0, 0, 1] for label in ('A', 'B', 'C')})

    # Both entry points must reject the unknown label with a KeyError;
    # duplicated is checked first, then drop_duplicates.
    for method in (frame.duplicated, frame.drop_duplicates):
        with pytest.raises(KeyError):
            method(subset)
| 1560 | + |
@pytest.mark.slow
def test_duplicated_do_not_fail_on_wide_dataframes(self):
    # gh-21524
    # Given a wide dataframe with a lot of columns holding different
    # (important!) values: 100 generated series of 30000 random
    # integers each, transposed into 100 rows by 30000 columns
    data = {'col_{0:02d}'.format(i): np.random.randint(0, 1000, 30000)
            for i in range(100)}
    df = pd.DataFrame(data).T
    result = df.duplicated()

    # Then duplicated() produces a boolean pd.Series as a result
    # and doesn't fail during the calculation.
    # The actual values don't matter here, though usually
    # they are all False in this case
    assert isinstance(result, pd.Series)
    # Compare against np.bool_ (the NumPy boolean scalar type) instead of
    # the bare np.bool alias, which NumPy deprecated and later removed.
    assert result.dtype == np.bool_
| 1577 | + |
@pytest.mark.parametrize('keep, expected', [
    ('first', Series([False, False, True, False, True])),
    ('last', Series([True, True, False, False, False])),
    (False, Series([True, True, True, False, True]))
])
def test_duplicated_keep(self, keep, expected):
    # Rows 0 and 4 are identical, as are rows 1 and 2; row 3 is unique.
    # `expected` is the duplicate mask for the given `keep` policy.
    frame = DataFrame({
        'A': [0, 1, 1, 2, 0],
        'B': ['a', 'b', 'b', 'c', 'a'],
    })

    tm.assert_series_equal(frame.duplicated(keep=keep), expected)
| 1588 | + |
@pytest.mark.xfail(reason="GH21720; nan/None falsely considered equal")
@pytest.mark.parametrize('keep, expected', [
    ('first', Series([False, False, True, False, True])),
    ('last', Series([True, True, False, False, False])),
    (False, Series([True, True, True, False, True]))
])
def test_duplicated_nan_none(self, keep, expected):
    # Single object-dtype column mixing np.nan and None. The expected
    # masks treat the two as different values: the None in row 3 is
    # never flagged, while the np.nan rows 0 and 4 duplicate each other.
    values = [np.nan, 3, 3, None, np.nan]
    frame = DataFrame({'C': values}, dtype=object)

    actual = frame.duplicated(keep=keep)
    tm.assert_series_equal(actual, expected)
| 1600 | + |
@pytest.mark.parametrize('keep', ['first', 'last', False])
@pytest.mark.parametrize('subset', [None, ['A', 'B'], 'A'])
def test_duplicated_subset(self, subset, keep):
    # df.duplicated(subset=...) must agree with calling duplicated() on
    # the frame restricted to the columns that `subset` denotes.
    df = DataFrame({'A': [0, 1, 1, 2, 0],
                    'B': ['a', 'b', 'b', 'c', 'a'],
                    'C': [np.nan, 3, 3, None, np.nan]})

    # Normalise into a separate name used only for column selection, so
    # that the parametrized value itself (None, a list, or a bare string)
    # is what reaches DataFrame.duplicated below.
    if subset is None:
        columns = list(df.columns)
    elif isinstance(subset, string_types):
        # need to have a DataFrame, not a Series
        # -> select columns with singleton list, not string
        columns = [subset]
    else:
        columns = subset

    expected = df[columns].duplicated(keep=keep)
    result = df.duplicated(keep=keep, subset=subset)
    tm.assert_series_equal(result, expected)
| 1618 | + |
1548 | 1619 | def test_drop_duplicates(self):
|
1549 | 1620 | df = DataFrame({'AAA': ['foo', 'bar', 'foo', 'bar',
|
1550 | 1621 | 'foo', 'bar', 'bar', 'foo'],
|
@@ -1640,36 +1711,6 @@ def test_drop_duplicates(self):
|
1640 | 1711 | for keep in ['first', 'last', False]:
|
1641 | 1712 | assert df.duplicated(keep=keep).sum() == 0
|
1642 | 1713 |
|
1643 |
| - @pytest.mark.parametrize('subset', ['a', ['a'], ['a', 'B']]) |
1644 |
| - def test_duplicated_with_misspelled_column_name(self, subset): |
1645 |
| - # GH 19730 |
1646 |
| - df = pd.DataFrame({'A': [0, 0, 1], |
1647 |
| - 'B': [0, 0, 1], |
1648 |
| - 'C': [0, 0, 1]}) |
1649 |
| - |
1650 |
| - with pytest.raises(KeyError): |
1651 |
| - df.duplicated(subset) |
1652 |
| - |
1653 |
| - with pytest.raises(KeyError): |
1654 |
| - df.drop_duplicates(subset) |
1655 |
| - |
1656 |
| - @pytest.mark.slow |
1657 |
| - def test_duplicated_do_not_fail_on_wide_dataframes(self): |
1658 |
| - # gh-21524 |
1659 |
| - # Given the wide dataframe with a lot of columns |
1660 |
| - # with different (important!) values |
1661 |
| - data = {'col_{0:02d}'.format(i): np.random.randint(0, 1000, 30000) |
1662 |
| - for i in range(100)} |
1663 |
| - df = pd.DataFrame(data).T |
1664 |
| - result = df.duplicated() |
1665 |
| - |
1666 |
| - # Then duplicates produce the bool pd.Series as a result |
1667 |
| - # and don't fail during calculation. |
1668 |
| - # Actual values doesn't matter here, though usually |
1669 |
| - # it's all False in this case |
1670 |
| - assert isinstance(result, pd.Series) |
1671 |
| - assert result.dtype == np.bool |
1672 |
| - |
1673 | 1714 | def test_drop_duplicates_with_duplicate_column_names(self):
|
1674 | 1715 | # GH17836
|
1675 | 1716 | df = DataFrame([
|
|
0 commit comments