73
73
rather than the first line of the file.
74
74
names : array-like, default None
75
75
List of column names to use. If file contains no header row, then you
76
- should explicitly pass header=None
76
+ should explicitly pass header=None. Duplicates in this list are not
77
+ allowed unless mangle_dupe_cols=True, which is the default.
77
78
index_col : int or sequence or False, default None
78
79
Column to use as the row labels of the DataFrame. If a sequence is given, a
79
80
MultiIndex is used. If you have a malformed file with delimiters at the end
91
92
prefix : str, default None
92
93
Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ...
93
94
mangle_dupe_cols : boolean, default True
94
- Duplicate columns will be specified as 'X.0'...'X.N', rather than 'X'...'X'
95
+ Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than
96
+ 'X'...'X'. Passing in False will cause data to be overwritten if there
97
+ are duplicate names in the columns.
95
98
dtype : Type name or dict of column -> type, default None
96
99
Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}
97
100
(Unsupported with engine='python'). Use `str` or `object` to preserve and
@@ -655,7 +658,14 @@ def _get_options_with_defaults(self, engine):
655
658
options = {}
656
659
657
660
for argname , default in compat .iteritems (_parser_defaults ):
658
- options [argname ] = kwds .get (argname , default )
661
+ value = kwds .get (argname , default )
662
+
663
+ # see gh-12935
664
+ if argname == 'mangle_dupe_cols' and not value :
665
+ raise ValueError ('Setting mangle_dupe_cols=False is '
666
+ 'not supported yet' )
667
+ else :
668
+ options [argname ] = value
659
669
660
670
for argname , default in compat .iteritems (_c_parser_defaults ):
661
671
if argname in kwds :
@@ -899,6 +909,7 @@ def __init__(self, kwds):
899
909
self .true_values = kwds .get ('true_values' )
900
910
self .false_values = kwds .get ('false_values' )
901
911
self .tupleize_cols = kwds .get ('tupleize_cols' , False )
912
+ self .mangle_dupe_cols = kwds .get ('mangle_dupe_cols' , True )
902
913
self .infer_datetime_format = kwds .pop ('infer_datetime_format' , False )
903
914
904
915
self ._date_conv = _make_date_converter (
@@ -1012,6 +1023,26 @@ def tostr(x):
1012
1023
1013
1024
return names , index_names , col_names , passed_names
1014
1025
1026
+ def _maybe_dedup_names (self , names ):
1027
+ # see gh-7160 and gh-9424: this helps to provide
1028
+ # immediate alleviation of the duplicate names
1029
+ # issue and appears to be satisfactory to users,
1030
+ # but ultimately, not needing to butcher the names
1031
+ # would be nice!
1032
+ if self .mangle_dupe_cols :
1033
+ names = list (names ) # so we can index
1034
+ counts = {}
1035
+
1036
+ for i , col in enumerate (names ):
1037
+ cur_count = counts .get (col , 0 )
1038
+
1039
+ if cur_count > 0 :
1040
+ names [i ] = '%s.%d' % (col , cur_count )
1041
+
1042
+ counts [col ] = cur_count + 1
1043
+
1044
+ return names
1045
+
1015
1046
def _maybe_make_multi_index_columns (self , columns , col_names = None ):
1016
1047
# possibly create a column mi here
1017
1048
if (not self .tupleize_cols and len (columns ) and
@@ -1314,10 +1345,11 @@ def read(self, nrows=None):
1314
1345
except StopIteration :
1315
1346
if self ._first_chunk :
1316
1347
self ._first_chunk = False
1348
+ names = self ._maybe_dedup_names (self .orig_names )
1317
1349
1318
1350
index , columns , col_dict = _get_empty_meta (
1319
- self .orig_names , self .index_col ,
1320
- self . index_names , dtype = self .kwds .get ('dtype' ))
1351
+ names , self .index_col , self .index_names ,
1352
+ dtype = self .kwds .get ('dtype' ))
1321
1353
1322
1354
if self .usecols is not None :
1323
1355
columns = self ._filter_usecols (columns )
@@ -1361,6 +1393,8 @@ def read(self, nrows=None):
1361
1393
if self .usecols is not None :
1362
1394
names = self ._filter_usecols (names )
1363
1395
1396
+ names = self ._maybe_dedup_names (names )
1397
+
1364
1398
# rename dict keys
1365
1399
data = sorted (data .items ())
1366
1400
data = dict ((k , v ) for k , (i , v ) in zip (names , data ))
@@ -1373,6 +1407,7 @@ def read(self, nrows=None):
1373
1407
1374
1408
# ugh, mutation
1375
1409
names = list (self .orig_names )
1410
+ names = self ._maybe_dedup_names (names )
1376
1411
1377
1412
if self .usecols is not None :
1378
1413
names = self ._filter_usecols (names )
@@ -1567,7 +1602,6 @@ def __init__(self, f, **kwds):
1567
1602
self .skipinitialspace = kwds ['skipinitialspace' ]
1568
1603
self .lineterminator = kwds ['lineterminator' ]
1569
1604
self .quoting = kwds ['quoting' ]
1570
- self .mangle_dupe_cols = kwds .get ('mangle_dupe_cols' , True )
1571
1605
self .usecols = _validate_usecols_arg (kwds ['usecols' ])
1572
1606
self .skip_blank_lines = kwds ['skip_blank_lines' ]
1573
1607
@@ -1756,8 +1790,8 @@ def read(self, rows=None):
1756
1790
columns = list (self .orig_names )
1757
1791
if not len (content ): # pragma: no cover
1758
1792
# DataFrame with the right metadata, even though it's length 0
1759
- return _get_empty_meta (self .orig_names ,
1760
- self .index_col ,
1793
+ names = self . _maybe_dedup_names (self .orig_names )
1794
+ return _get_empty_meta ( names , self .index_col ,
1761
1795
self .index_names )
1762
1796
1763
1797
# handle new style for names in index
@@ -1770,26 +1804,28 @@ def read(self, rows=None):
1770
1804
alldata = self ._rows_to_cols (content )
1771
1805
data = self ._exclude_implicit_index (alldata )
1772
1806
1773
- columns , data = self ._do_date_conversions (self .columns , data )
1807
+ columns = self ._maybe_dedup_names (self .columns )
1808
+ columns , data = self ._do_date_conversions (columns , data )
1774
1809
1775
1810
data = self ._convert_data (data )
1776
1811
index , columns = self ._make_index (data , alldata , columns , indexnamerow )
1777
1812
1778
1813
return index , columns , data
1779
1814
1780
1815
def _exclude_implicit_index (self , alldata ):
1816
+ names = self ._maybe_dedup_names (self .orig_names )
1781
1817
1782
1818
if self ._implicit_index :
1783
1819
excl_indices = self .index_col
1784
1820
1785
1821
data = {}
1786
1822
offset = 0
1787
- for i , col in enumerate (self . orig_names ):
1823
+ for i , col in enumerate (names ):
1788
1824
while i + offset in excl_indices :
1789
1825
offset += 1
1790
1826
data [col ] = alldata [i + offset ]
1791
1827
else :
1792
- data = dict ((k , v ) for k , v in zip (self . orig_names , alldata ))
1828
+ data = dict ((k , v ) for k , v in zip (names , alldata ))
1793
1829
1794
1830
return data
1795
1831
0 commit comments