diff --git a/doc/source/io.rst b/doc/source/io.rst index af8bca14e5d6f..104172d9574f1 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -120,7 +120,8 @@ header : int or list of ints, default ``'infer'`` rather than the first line of the file. names : array-like, default ``None`` List of column names to use. If file contains no header row, then you should - explicitly pass ``header=None``. + explicitly pass ``header=None``. Duplicates in this list are not allowed unless + ``mangle_dupe_cols=True``, which is the default. index_col : int or sequence or ``False``, default ``None`` Column to use as the row labels of the DataFrame. If a sequence is given, a MultiIndex is used. If you have a malformed file with delimiters at the end of @@ -139,6 +140,8 @@ prefix : str, default ``None`` Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ... mangle_dupe_cols : boolean, default ``True`` Duplicate columns will be specified as 'X.0'...'X.N', rather than 'X'...'X'. + Passing in False will cause data to be overwritten if there are duplicate + names in the columns. General Parsing Configuration +++++++++++++++++++++++++++++ @@ -432,6 +435,42 @@ If the header is in a row other than the first, pass the row number to data = 'skip this skip it\na,b,c\n1,2,3\n4,5,6\n7,8,9' pd.read_csv(StringIO(data), header=1) +.. _io.dupe_names: + +Duplicate names parsing +''''''''''''''''''''''' + +If the file or header contains duplicate names, pandas by default will deduplicate +these names so as to prevent data overwrite: + +.. ipython :: python + + data = 'a,b,a\n0,1,2\n3,4,5' + pd.read_csv(StringIO(data)) + +There is no more duplicate data because ``mangle_dupe_cols=True`` by default, which modifies +a series of duplicate columns 'X'...'X' to become 'X.0'...'X.N'. If ``mangle_dupe_cols +=False``, duplicate data can arise: + +.. 
code-block :: python + + In [2]: data = 'a,b,a\n0,1,2\n3,4,5' + In [3]: pd.read_csv(StringIO(data), mangle_dupe_cols=False) + Out[3]: + a b a + 0 2 1 2 + 1 5 4 5 + +To prevent users from encountering this problem with duplicate data, a ``ValueError`` +exception is raised if ``mangle_dupe_cols != True``: + +.. code-block :: python + + In [2]: data = 'a,b,a\n0,1,2\n3,4,5' + In [3]: pd.read_csv(StringIO(data), mangle_dupe_cols=False) + ... + ValueError: Setting mangle_dupe_cols=False is not supported yet + .. _io.usecols: Filtering columns (``usecols``) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index e2e40b643ba99..2854dbf5e655b 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -19,10 +19,37 @@ Highlights include: New features ~~~~~~~~~~~~ +.. _whatsnew_0182.enhancements.read_csv_dupe_col_names_support: +``pd.read_csv`` has improved support for duplicate column names +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +:ref:`Duplicate column names <io.dupe_names>` are now supported in ``pd.read_csv()`` whether +they are in the file or passed in as the ``names`` parameter (:issue:`7160`, :issue:`9424`) +.. ipython :: python + data = '0,1,2\n3,4,5' + names = ['a', 'b', 'a'] + +Previous behaviour: + +.. code-block:: ipython + + In [2]: pd.read_csv(StringIO(data), names=names) + Out[2]: + a b a + 0 2 1 2 + 1 5 4 5 + +The first 'a' column contains the same data as the second 'a' column, when it should have +contained the array ``[0, 3]``. + +New behaviour: + +.. ipython :: python + + In [2]: pd.read_csv(StringIO(data), names=names) .. _whatsnew_0182.enhancements.other: diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 07b92fd6bfd28..c939864d7a38b 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -73,7 +73,8 @@ rather than the first line of the file. names : array-like, default None List of column names to use. 
If file contains no header row, then you - should explicitly pass header=None + should explicitly pass header=None. Duplicates in this list are not + allowed unless mangle_dupe_cols=True, which is the default. index_col : int or sequence or False, default None Column to use as the row labels of the DataFrame. If a sequence is given, a MultiIndex is used. If you have a malformed file with delimiters at the end @@ -91,7 +92,9 @@ prefix : str, default None Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ... mangle_dupe_cols : boolean, default True - Duplicate columns will be specified as 'X.0'...'X.N', rather than 'X'...'X' + Duplicate columns will be specified as 'X.0'...'X.N', rather than + 'X'...'X'. Passing in False will cause data to be overwritten if there + are duplicate names in the columns. dtype : Type name or dict of column -> type, default None Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32} (Unsupported with engine='python'). Use `str` or `object` to preserve and @@ -655,7 +658,14 @@ def _get_options_with_defaults(self, engine): options = {} for argname, default in compat.iteritems(_parser_defaults): - options[argname] = kwds.get(argname, default) + value = kwds.get(argname, default) + + # see gh-12935 + if argname == 'mangle_dupe_cols' and not value: + raise ValueError('Setting mangle_dupe_cols=False is ' + 'not supported yet') + else: + options[argname] = value for argname, default in compat.iteritems(_c_parser_defaults): if argname in kwds: @@ -899,6 +909,7 @@ def __init__(self, kwds): self.true_values = kwds.get('true_values') self.false_values = kwds.get('false_values') self.tupleize_cols = kwds.get('tupleize_cols', False) + self.mangle_dupe_cols = kwds.get('mangle_dupe_cols', True) self.infer_datetime_format = kwds.pop('infer_datetime_format', False) self._date_conv = _make_date_converter( @@ -1012,6 +1023,26 @@ def tostr(x): return names, index_names, col_names, passed_names + def _maybe_dedup_names(self, 
names): + # see gh-7160 and gh-9424: this helps to provide + # immediate alleviation of the duplicate names + # issue and appears to be satisfactory to users, + # but ultimately, not needing to butcher the names + # would be nice! + if self.mangle_dupe_cols: + names = list(names) # so we can index + counts = {} + + for i, col in enumerate(names): + cur_count = counts.get(col, 0) + + if cur_count > 0: + names[i] = '%s.%d' % (col, cur_count) + + counts[col] = cur_count + 1 + + return names + def _maybe_make_multi_index_columns(self, columns, col_names=None): # possibly create a column mi here if (not self.tupleize_cols and len(columns) and @@ -1314,10 +1345,11 @@ def read(self, nrows=None): except StopIteration: if self._first_chunk: self._first_chunk = False + names = self._maybe_dedup_names(self.orig_names) index, columns, col_dict = _get_empty_meta( - self.orig_names, self.index_col, - self.index_names, dtype=self.kwds.get('dtype')) + names, self.index_col, self.index_names, + dtype=self.kwds.get('dtype')) if self.usecols is not None: columns = self._filter_usecols(columns) @@ -1361,6 +1393,8 @@ def read(self, nrows=None): if self.usecols is not None: names = self._filter_usecols(names) + names = self._maybe_dedup_names(names) + # rename dict keys data = sorted(data.items()) data = dict((k, v) for k, (i, v) in zip(names, data)) @@ -1373,6 +1407,7 @@ def read(self, nrows=None): # ugh, mutation names = list(self.orig_names) + names = self._maybe_dedup_names(names) if self.usecols is not None: names = self._filter_usecols(names) @@ -1567,7 +1602,6 @@ def __init__(self, f, **kwds): self.skipinitialspace = kwds['skipinitialspace'] self.lineterminator = kwds['lineterminator'] self.quoting = kwds['quoting'] - self.mangle_dupe_cols = kwds.get('mangle_dupe_cols', True) self.usecols = _validate_usecols_arg(kwds['usecols']) self.skip_blank_lines = kwds['skip_blank_lines'] @@ -1756,8 +1790,8 @@ def read(self, rows=None): columns = list(self.orig_names) if not len(content): # 
pragma: no cover # DataFrame with the right metadata, even though it's length 0 - return _get_empty_meta(self.orig_names, - self.index_col, + names = self._maybe_dedup_names(self.orig_names) + return _get_empty_meta(names, self.index_col, self.index_names) # handle new style for names in index @@ -1770,7 +1804,8 @@ def read(self, rows=None): alldata = self._rows_to_cols(content) data = self._exclude_implicit_index(alldata) - columns, data = self._do_date_conversions(self.columns, data) + columns = self._maybe_dedup_names(self.columns) + columns, data = self._do_date_conversions(columns, data) data = self._convert_data(data) index, columns = self._make_index(data, alldata, columns, indexnamerow) @@ -1778,18 +1813,19 @@ def read(self, rows=None): return index, columns, data def _exclude_implicit_index(self, alldata): + names = self._maybe_dedup_names(self.orig_names) if self._implicit_index: excl_indices = self.index_col data = {} offset = 0 - for i, col in enumerate(self.orig_names): + for i, col in enumerate(names): while i + offset in excl_indices: offset += 1 data[col] = alldata[i + offset] else: - data = dict((k, v) for k, v in zip(self.orig_names, alldata)) + data = dict((k, v) for k, v in zip(names, alldata)) return data diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py index 8e44802adf744..325418f87af6a 100644 --- a/pandas/io/tests/parser/c_parser_only.py +++ b/pandas/io/tests/parser/c_parser_only.py @@ -293,23 +293,18 @@ def test_empty_with_mangled_column_pass_dtype_by_indexes(self): {'one': np.empty(0, dtype='u1'), 'one.1': np.empty(0, dtype='f')}) tm.assert_frame_equal(result, expected, check_index_type=False) - def test_empty_with_dup_column_pass_dtype_by_names(self): - data = 'one,one' - result = self.read_csv( - StringIO(data), mangle_dupe_cols=False, dtype={'one': 'u1'}) - expected = pd.concat([Series([], name='one', dtype='u1')] * 2, axis=1) - tm.assert_frame_equal(result, expected, check_index_type=False) 
- def test_empty_with_dup_column_pass_dtype_by_indexes(self): - # FIXME in gh-9424 - raise nose.SkipTest( - "gh-9424; known failure read_csv with duplicate columns") + # see gh-9424 + expected = pd.concat([Series([], name='one', dtype='u1'), + Series([], name='one.1', dtype='f')], axis=1) data = 'one,one' - result = self.read_csv( - StringIO(data), mangle_dupe_cols=False, dtype={0: 'u1', 1: 'f'}) - expected = pd.concat([Series([], name='one', dtype='u1'), - Series([], name='one', dtype='f')], axis=1) + result = self.read_csv(StringIO(data), dtype={0: 'u1', 1: 'f'}) + tm.assert_frame_equal(result, expected, check_index_type=False) + + data = '' + result = self.read_csv(StringIO(data), names=['one', 'one'], + dtype={0: 'u1', 1: 'f'}) tm.assert_frame_equal(result, expected, check_index_type=False) def test_usecols_dtypes(self): diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index 57ab9477302c1..90a0b420eed3c 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -243,6 +243,8 @@ def test_unnamed_columns(self): 'Unnamed: 4']) def test_duplicate_columns(self): + # TODO: add test for condition 'mangle_dupe_cols=False' + # once it is actually supported (gh-12935) data = """A,A,B,B,B 1,2,3,4,5 6,7,8,9,10 @@ -256,11 +258,6 @@ def test_duplicate_columns(self): self.assertEqual(list(df.columns), ['A', 'A.1', 'B', 'B.1', 'B.2']) - df = getattr(self, method)(StringIO(data), sep=',', - mangle_dupe_cols=False) - self.assertEqual(list(df.columns), - ['A', 'A', 'B', 'B', 'B']) - df = getattr(self, method)(StringIO(data), sep=',', mangle_dupe_cols=True) self.assertEqual(list(df.columns), @@ -1281,3 +1278,17 @@ def test_euro_decimal_format(self): self.assertEqual(df2['Number1'].dtype, float) self.assertEqual(df2['Number2'].dtype, float) self.assertEqual(df2['Number3'].dtype, float) + + def test_read_duplicate_names(self): + # See gh-7160 + data = "a,b,a\n0,1,2\n3,4,5" + df = self.read_csv(StringIO(data)) + expected = 
DataFrame([[0, 1, 2], [3, 4, 5]], + columns=['a', 'b', 'a.1']) + tm.assert_frame_equal(df, expected) + + data = "0,1,2\n3,4,5" + df = self.read_csv(StringIO(data), names=["a", "b", "a"]) + expected = DataFrame([[0, 1, 2], [3, 4, 5]], + columns=['a', 'b', 'a.1']) + tm.assert_frame_equal(df, expected) diff --git a/pandas/io/tests/parser/test_parsers.py b/pandas/io/tests/parser/test_parsers.py index 374485b5ddaad..ea8ce9b616f36 100644 --- a/pandas/io/tests/parser/test_parsers.py +++ b/pandas/io/tests/parser/test_parsers.py @@ -84,13 +84,6 @@ def read_table(self, *args, **kwds): class TestPythonParser(BaseParser, PythonParserTests, tm.TestCase): - """ - Class for Python parser testing. Unless specifically stated - as a PythonParser-specific issue, the goal is to eventually move - as many of these tests into ParserTests as soon as the C parser - can accept further specific arguments when parsing. - """ - engine = 'python' float_precision_choices = [None] diff --git a/pandas/io/tests/parser/test_unsupported.py b/pandas/io/tests/parser/test_unsupported.py index 1813a95d7a306..cefe7d939d1ab 100644 --- a/pandas/io/tests/parser/test_unsupported.py +++ b/pandas/io/tests/parser/test_unsupported.py @@ -20,6 +20,16 @@ class TestUnsupportedFeatures(tm.TestCase): + def test_mangle_dupe_cols_false(self): + # see gh-12935 + data = 'a b c\n1 2 3' + msg = 'is not supported' + + for engine in ('c', 'python'): + with tm.assertRaisesRegexp(ValueError, msg): + read_csv(StringIO(data), engine=engine, + mangle_dupe_cols=False) + def test_c_engine(self): # see gh-6607 data = 'a b c\n1 2 3'