@@ -1542,384 +1542,6 @@ def test_isin_empty_datetimelike(self):
1542
1542
result = df1_td .isin (df3 )
1543
1543
tm .assert_frame_equal (result , expected )
1544
1544
1545
- # ----------------------------------------------------------------------
1546
- # Row deduplication
1547
-
1548
- def test_drop_duplicates (self ):
1549
- df = DataFrame ({'AAA' : ['foo' , 'bar' , 'foo' , 'bar' ,
1550
- 'foo' , 'bar' , 'bar' , 'foo' ],
1551
- 'B' : ['one' , 'one' , 'two' , 'two' ,
1552
- 'two' , 'two' , 'one' , 'two' ],
1553
- 'C' : [1 , 1 , 2 , 2 , 2 , 2 , 1 , 2 ],
1554
- 'D' : lrange (8 )})
1555
-
1556
- # single column
1557
- result = df .drop_duplicates ('AAA' )
1558
- expected = df [:2 ]
1559
- tm .assert_frame_equal (result , expected )
1560
-
1561
- result = df .drop_duplicates ('AAA' , keep = 'last' )
1562
- expected = df .loc [[6 , 7 ]]
1563
- tm .assert_frame_equal (result , expected )
1564
-
1565
- result = df .drop_duplicates ('AAA' , keep = False )
1566
- expected = df .loc [[]]
1567
- tm .assert_frame_equal (result , expected )
1568
- assert len (result ) == 0
1569
-
1570
- # multi column
1571
- expected = df .loc [[0 , 1 , 2 , 3 ]]
1572
- result = df .drop_duplicates (np .array (['AAA' , 'B' ]))
1573
- tm .assert_frame_equal (result , expected )
1574
- result = df .drop_duplicates (['AAA' , 'B' ])
1575
- tm .assert_frame_equal (result , expected )
1576
-
1577
- result = df .drop_duplicates (('AAA' , 'B' ), keep = 'last' )
1578
- expected = df .loc [[0 , 5 , 6 , 7 ]]
1579
- tm .assert_frame_equal (result , expected )
1580
-
1581
- result = df .drop_duplicates (('AAA' , 'B' ), keep = False )
1582
- expected = df .loc [[0 ]]
1583
- tm .assert_frame_equal (result , expected )
1584
-
1585
- # consider everything
1586
- df2 = df .loc [:, ['AAA' , 'B' , 'C' ]]
1587
-
1588
- result = df2 .drop_duplicates ()
1589
- # in this case only
1590
- expected = df2 .drop_duplicates (['AAA' , 'B' ])
1591
- tm .assert_frame_equal (result , expected )
1592
-
1593
- result = df2 .drop_duplicates (keep = 'last' )
1594
- expected = df2 .drop_duplicates (['AAA' , 'B' ], keep = 'last' )
1595
- tm .assert_frame_equal (result , expected )
1596
-
1597
- result = df2 .drop_duplicates (keep = False )
1598
- expected = df2 .drop_duplicates (['AAA' , 'B' ], keep = False )
1599
- tm .assert_frame_equal (result , expected )
1600
-
1601
- # integers
1602
- result = df .drop_duplicates ('C' )
1603
- expected = df .iloc [[0 , 2 ]]
1604
- tm .assert_frame_equal (result , expected )
1605
- result = df .drop_duplicates ('C' , keep = 'last' )
1606
- expected = df .iloc [[- 2 , - 1 ]]
1607
- tm .assert_frame_equal (result , expected )
1608
-
1609
- df ['E' ] = df ['C' ].astype ('int8' )
1610
- result = df .drop_duplicates ('E' )
1611
- expected = df .iloc [[0 , 2 ]]
1612
- tm .assert_frame_equal (result , expected )
1613
- result = df .drop_duplicates ('E' , keep = 'last' )
1614
- expected = df .iloc [[- 2 , - 1 ]]
1615
- tm .assert_frame_equal (result , expected )
1616
-
1617
- # GH 11376
1618
- df = pd .DataFrame ({'x' : [7 , 6 , 3 , 3 , 4 , 8 , 0 ],
1619
- 'y' : [0 , 6 , 5 , 5 , 9 , 1 , 2 ]})
1620
- expected = df .loc [df .index != 3 ]
1621
- tm .assert_frame_equal (df .drop_duplicates (), expected )
1622
-
1623
- df = pd .DataFrame ([[1 , 0 ], [0 , 2 ]])
1624
- tm .assert_frame_equal (df .drop_duplicates (), df )
1625
-
1626
- df = pd .DataFrame ([[- 2 , 0 ], [0 , - 4 ]])
1627
- tm .assert_frame_equal (df .drop_duplicates (), df )
1628
-
1629
- x = np .iinfo (np .int64 ).max / 3 * 2
1630
- df = pd .DataFrame ([[- x , x ], [0 , x + 4 ]])
1631
- tm .assert_frame_equal (df .drop_duplicates (), df )
1632
-
1633
- df = pd .DataFrame ([[- x , x ], [x , x + 4 ]])
1634
- tm .assert_frame_equal (df .drop_duplicates (), df )
1635
-
1636
- # GH 11864
1637
- df = pd .DataFrame ([i ] * 9 for i in range (16 ))
1638
- df = df .append ([[1 ] + [0 ] * 8 ], ignore_index = True )
1639
-
1640
- for keep in ['first' , 'last' , False ]:
1641
- assert df .duplicated (keep = keep ).sum () == 0
1642
-
1643
@pytest.mark.parametrize('subset', ['a', ['a'], ['a', 'B']])
def test_duplicated_with_misspelled_column_name(self, subset):
    """A subset naming a column the frame lacks must raise ``KeyError``.

    GH 19730: checked for both ``duplicated`` and ``drop_duplicates``.
    """
    frame = pd.DataFrame({label: [0, 0, 1] for label in ('A', 'B', 'C')})

    # 'a' is not a column of the frame (only 'A', 'B' and 'C' are)
    for method in (frame.duplicated, frame.drop_duplicates):
        with pytest.raises(KeyError):
            method(subset)
1655
-
1656
@pytest.mark.slow
def test_duplicated_do_not_fail_on_wide_dataframes(self):
    """``duplicated`` on a very wide frame must return a boolean Series.

    gh-21524: the frame is built from 100 keys of 30000 random integers
    each and then transposed, giving 100 rows by 30000 columns.
    """
    # Given the wide dataframe with a lot of columns
    # with different (important!) values
    data = {'col_{0:02d}'.format(i): np.random.randint(0, 1000, 30000)
            for i in range(100)}
    df = pd.DataFrame(data).T
    result = df.duplicated()

    # Then duplicates produce the bool pd.Series as a result
    # and don't fail during calculation.
    # Actual values don't matter here, though usually
    # it's all False in this case
    assert isinstance(result, pd.Series)
    # np.bool_ rather than the np.bool alias, which NumPy 1.24 removed
    assert result.dtype == np.bool_
1672
-
1673
- def test_drop_duplicates_with_duplicate_column_names (self ):
1674
- # GH17836
1675
- df = DataFrame ([
1676
- [1 , 2 , 5 ],
1677
- [3 , 4 , 6 ],
1678
- [3 , 4 , 7 ]
1679
- ], columns = ['a' , 'a' , 'b' ])
1680
-
1681
- result0 = df .drop_duplicates ()
1682
- tm .assert_frame_equal (result0 , df )
1683
-
1684
- result1 = df .drop_duplicates ('a' )
1685
- expected1 = df [:2 ]
1686
- tm .assert_frame_equal (result1 , expected1 )
1687
-
1688
- def test_drop_duplicates_for_take_all (self ):
1689
- df = DataFrame ({'AAA' : ['foo' , 'bar' , 'baz' , 'bar' ,
1690
- 'foo' , 'bar' , 'qux' , 'foo' ],
1691
- 'B' : ['one' , 'one' , 'two' , 'two' ,
1692
- 'two' , 'two' , 'one' , 'two' ],
1693
- 'C' : [1 , 1 , 2 , 2 , 2 , 2 , 1 , 2 ],
1694
- 'D' : lrange (8 )})
1695
-
1696
- # single column
1697
- result = df .drop_duplicates ('AAA' )
1698
- expected = df .iloc [[0 , 1 , 2 , 6 ]]
1699
- tm .assert_frame_equal (result , expected )
1700
-
1701
- result = df .drop_duplicates ('AAA' , keep = 'last' )
1702
- expected = df .iloc [[2 , 5 , 6 , 7 ]]
1703
- tm .assert_frame_equal (result , expected )
1704
-
1705
- result = df .drop_duplicates ('AAA' , keep = False )
1706
- expected = df .iloc [[2 , 6 ]]
1707
- tm .assert_frame_equal (result , expected )
1708
-
1709
- # multiple columns
1710
- result = df .drop_duplicates (['AAA' , 'B' ])
1711
- expected = df .iloc [[0 , 1 , 2 , 3 , 4 , 6 ]]
1712
- tm .assert_frame_equal (result , expected )
1713
-
1714
- result = df .drop_duplicates (['AAA' , 'B' ], keep = 'last' )
1715
- expected = df .iloc [[0 , 1 , 2 , 5 , 6 , 7 ]]
1716
- tm .assert_frame_equal (result , expected )
1717
-
1718
- result = df .drop_duplicates (['AAA' , 'B' ], keep = False )
1719
- expected = df .iloc [[0 , 1 , 2 , 6 ]]
1720
- tm .assert_frame_equal (result , expected )
1721
-
1722
- def test_drop_duplicates_tuple (self ):
1723
- df = DataFrame ({('AA' , 'AB' ): ['foo' , 'bar' , 'foo' , 'bar' ,
1724
- 'foo' , 'bar' , 'bar' , 'foo' ],
1725
- 'B' : ['one' , 'one' , 'two' , 'two' ,
1726
- 'two' , 'two' , 'one' , 'two' ],
1727
- 'C' : [1 , 1 , 2 , 2 , 2 , 2 , 1 , 2 ],
1728
- 'D' : lrange (8 )})
1729
-
1730
- # single column
1731
- result = df .drop_duplicates (('AA' , 'AB' ))
1732
- expected = df [:2 ]
1733
- tm .assert_frame_equal (result , expected )
1734
-
1735
- result = df .drop_duplicates (('AA' , 'AB' ), keep = 'last' )
1736
- expected = df .loc [[6 , 7 ]]
1737
- tm .assert_frame_equal (result , expected )
1738
-
1739
- result = df .drop_duplicates (('AA' , 'AB' ), keep = False )
1740
- expected = df .loc [[]] # empty df
1741
- assert len (result ) == 0
1742
- tm .assert_frame_equal (result , expected )
1743
-
1744
- # multi column
1745
- expected = df .loc [[0 , 1 , 2 , 3 ]]
1746
- result = df .drop_duplicates ((('AA' , 'AB' ), 'B' ))
1747
- tm .assert_frame_equal (result , expected )
1748
-
1749
- def test_drop_duplicates_NA (self ):
1750
- # none
1751
- df = DataFrame ({'A' : [None , None , 'foo' , 'bar' ,
1752
- 'foo' , 'bar' , 'bar' , 'foo' ],
1753
- 'B' : ['one' , 'one' , 'two' , 'two' ,
1754
- 'two' , 'two' , 'one' , 'two' ],
1755
- 'C' : [1.0 , np .nan , np .nan , np .nan , 1. , 1. , 1 , 1. ],
1756
- 'D' : lrange (8 )})
1757
-
1758
- # single column
1759
- result = df .drop_duplicates ('A' )
1760
- expected = df .loc [[0 , 2 , 3 ]]
1761
- tm .assert_frame_equal (result , expected )
1762
-
1763
- result = df .drop_duplicates ('A' , keep = 'last' )
1764
- expected = df .loc [[1 , 6 , 7 ]]
1765
- tm .assert_frame_equal (result , expected )
1766
-
1767
- result = df .drop_duplicates ('A' , keep = False )
1768
- expected = df .loc [[]] # empty df
1769
- tm .assert_frame_equal (result , expected )
1770
- assert len (result ) == 0
1771
-
1772
- # multi column
1773
- result = df .drop_duplicates (['A' , 'B' ])
1774
- expected = df .loc [[0 , 2 , 3 , 6 ]]
1775
- tm .assert_frame_equal (result , expected )
1776
-
1777
- result = df .drop_duplicates (['A' , 'B' ], keep = 'last' )
1778
- expected = df .loc [[1 , 5 , 6 , 7 ]]
1779
- tm .assert_frame_equal (result , expected )
1780
-
1781
- result = df .drop_duplicates (['A' , 'B' ], keep = False )
1782
- expected = df .loc [[6 ]]
1783
- tm .assert_frame_equal (result , expected )
1784
-
1785
- # nan
1786
- df = DataFrame ({'A' : ['foo' , 'bar' , 'foo' , 'bar' ,
1787
- 'foo' , 'bar' , 'bar' , 'foo' ],
1788
- 'B' : ['one' , 'one' , 'two' , 'two' ,
1789
- 'two' , 'two' , 'one' , 'two' ],
1790
- 'C' : [1.0 , np .nan , np .nan , np .nan , 1. , 1. , 1 , 1. ],
1791
- 'D' : lrange (8 )})
1792
-
1793
- # single column
1794
- result = df .drop_duplicates ('C' )
1795
- expected = df [:2 ]
1796
- tm .assert_frame_equal (result , expected )
1797
-
1798
- result = df .drop_duplicates ('C' , keep = 'last' )
1799
- expected = df .loc [[3 , 7 ]]
1800
- tm .assert_frame_equal (result , expected )
1801
-
1802
- result = df .drop_duplicates ('C' , keep = False )
1803
- expected = df .loc [[]] # empty df
1804
- tm .assert_frame_equal (result , expected )
1805
- assert len (result ) == 0
1806
-
1807
- # multi column
1808
- result = df .drop_duplicates (['C' , 'B' ])
1809
- expected = df .loc [[0 , 1 , 2 , 4 ]]
1810
- tm .assert_frame_equal (result , expected )
1811
-
1812
- result = df .drop_duplicates (['C' , 'B' ], keep = 'last' )
1813
- expected = df .loc [[1 , 3 , 6 , 7 ]]
1814
- tm .assert_frame_equal (result , expected )
1815
-
1816
- result = df .drop_duplicates (['C' , 'B' ], keep = False )
1817
- expected = df .loc [[1 ]]
1818
- tm .assert_frame_equal (result , expected )
1819
-
1820
- def test_drop_duplicates_NA_for_take_all (self ):
1821
- # none
1822
- df = DataFrame ({'A' : [None , None , 'foo' , 'bar' ,
1823
- 'foo' , 'baz' , 'bar' , 'qux' ],
1824
- 'C' : [1.0 , np .nan , np .nan , np .nan , 1. , 2. , 3 , 1. ]})
1825
-
1826
- # single column
1827
- result = df .drop_duplicates ('A' )
1828
- expected = df .iloc [[0 , 2 , 3 , 5 , 7 ]]
1829
- tm .assert_frame_equal (result , expected )
1830
-
1831
- result = df .drop_duplicates ('A' , keep = 'last' )
1832
- expected = df .iloc [[1 , 4 , 5 , 6 , 7 ]]
1833
- tm .assert_frame_equal (result , expected )
1834
-
1835
- result = df .drop_duplicates ('A' , keep = False )
1836
- expected = df .iloc [[5 , 7 ]]
1837
- tm .assert_frame_equal (result , expected )
1838
-
1839
- # nan
1840
-
1841
- # single column
1842
- result = df .drop_duplicates ('C' )
1843
- expected = df .iloc [[0 , 1 , 5 , 6 ]]
1844
- tm .assert_frame_equal (result , expected )
1845
-
1846
- result = df .drop_duplicates ('C' , keep = 'last' )
1847
- expected = df .iloc [[3 , 5 , 6 , 7 ]]
1848
- tm .assert_frame_equal (result , expected )
1849
-
1850
- result = df .drop_duplicates ('C' , keep = False )
1851
- expected = df .iloc [[5 , 6 ]]
1852
- tm .assert_frame_equal (result , expected )
1853
-
1854
- def test_drop_duplicates_inplace (self ):
1855
- orig = DataFrame ({'A' : ['foo' , 'bar' , 'foo' , 'bar' ,
1856
- 'foo' , 'bar' , 'bar' , 'foo' ],
1857
- 'B' : ['one' , 'one' , 'two' , 'two' ,
1858
- 'two' , 'two' , 'one' , 'two' ],
1859
- 'C' : [1 , 1 , 2 , 2 , 2 , 2 , 1 , 2 ],
1860
- 'D' : lrange (8 )})
1861
-
1862
- # single column
1863
- df = orig .copy ()
1864
- df .drop_duplicates ('A' , inplace = True )
1865
- expected = orig [:2 ]
1866
- result = df
1867
- tm .assert_frame_equal (result , expected )
1868
-
1869
- df = orig .copy ()
1870
- df .drop_duplicates ('A' , keep = 'last' , inplace = True )
1871
- expected = orig .loc [[6 , 7 ]]
1872
- result = df
1873
- tm .assert_frame_equal (result , expected )
1874
-
1875
- df = orig .copy ()
1876
- df .drop_duplicates ('A' , keep = False , inplace = True )
1877
- expected = orig .loc [[]]
1878
- result = df
1879
- tm .assert_frame_equal (result , expected )
1880
- assert len (df ) == 0
1881
-
1882
- # multi column
1883
- df = orig .copy ()
1884
- df .drop_duplicates (['A' , 'B' ], inplace = True )
1885
- expected = orig .loc [[0 , 1 , 2 , 3 ]]
1886
- result = df
1887
- tm .assert_frame_equal (result , expected )
1888
-
1889
- df = orig .copy ()
1890
- df .drop_duplicates (['A' , 'B' ], keep = 'last' , inplace = True )
1891
- expected = orig .loc [[0 , 5 , 6 , 7 ]]
1892
- result = df
1893
- tm .assert_frame_equal (result , expected )
1894
-
1895
- df = orig .copy ()
1896
- df .drop_duplicates (['A' , 'B' ], keep = False , inplace = True )
1897
- expected = orig .loc [[0 ]]
1898
- result = df
1899
- tm .assert_frame_equal (result , expected )
1900
-
1901
- # consider everything
1902
- orig2 = orig .loc [:, ['A' , 'B' , 'C' ]].copy ()
1903
-
1904
- df2 = orig2 .copy ()
1905
- df2 .drop_duplicates (inplace = True )
1906
- # in this case only
1907
- expected = orig2 .drop_duplicates (['A' , 'B' ])
1908
- result = df2
1909
- tm .assert_frame_equal (result , expected )
1910
-
1911
- df2 = orig2 .copy ()
1912
- df2 .drop_duplicates (keep = 'last' , inplace = True )
1913
- expected = orig2 .drop_duplicates (['A' , 'B' ], keep = 'last' )
1914
- result = df2
1915
- tm .assert_frame_equal (result , expected )
1916
-
1917
- df2 = orig2 .copy ()
1918
- df2 .drop_duplicates (keep = False , inplace = True )
1919
- expected = orig2 .drop_duplicates (['A' , 'B' ], keep = False )
1920
- result = df2
1921
- tm .assert_frame_equal (result , expected )
1922
-
1923
1545
# Rounding
1924
1546
def test_round (self ):
1925
1547
# GH 2665
0 commit comments