Skip to content

Commit 2ada940

Browse files
committed
API: Warn about dups in names for read_csv
xref gh-17095 (pandas-dev/pandas).
1 parent 4004367 commit 2ada940

File tree

5 files changed

+71
-29
lines changed

5 files changed

+71
-29
lines changed

doc/source/whatsnew/v0.21.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -422,6 +422,7 @@ Other API Changes
422422
- The Categorical constructor no longer accepts a scalar for the ``categories`` keyword. (:issue:`16022`)
423423
- Accessing a non-existent attribute on a closed :class:`~pandas.HDFStore` will now
424424
raise an ``AttributeError`` rather than a ``ClosedFileError`` (:issue:`16301`)
425+
- :func:`read_csv` now issues a ``UserWarning`` if the ``names`` parameter contains duplicates (:issue:`17095`)
425426
- :func:`read_csv` now treats ``'null'`` strings as missing values by default (:issue:`16471`)
426427
- :func:`read_csv` now treats ``'n/a'`` strings as missing values by default (:issue:`16078`)
427428
- :class:`pandas.HDFStore`'s string representation is now faster and less detailed. For the previous behavior, use ``pandas.HDFStore.info()``. (:issue:`16503`).

pandas/io/parsers.py

+30
Original file line numberDiff line numberDiff line change
@@ -385,6 +385,33 @@ def _validate_integer(name, val, min_val=0):
385385
return val
386386

387387

388+
def _validate_names(names):
    """
    Check if the `names` parameter contains duplicates.

    Currently, this function issues a ``UserWarning`` if that is the case.
    In the future, we will raise an error.

    Parameters
    ----------
    names : array-like or None
        An array containing a list of the names used for the output DataFrame.

    Returns
    -------
    names : array-like or None
        The original `names` parameter, returned unchanged.
    """

    if names is not None:
        # A set drops repeated entries, so a shorter set means duplicates.
        if len(names) != len(set(names)):
            msg = ("Duplicate names specified. This "
                   "will raise an error in the future.")
            # stacklevel=3 attributes the warning to the frame two levels
            # above this helper rather than to the helper itself.
            warnings.warn(msg, UserWarning, stacklevel=3)

    return names
413+
414+
388415
def _read(filepath_or_buffer, kwds):
389416
"""Generic reader of line files."""
390417
encoding = kwds.get('encoding', None)
@@ -407,6 +434,9 @@ def _read(filepath_or_buffer, kwds):
407434
chunksize = _validate_integer('chunksize', kwds.get('chunksize', None), 1)
408435
nrows = _validate_integer('nrows', kwds.get('nrows', None))
409436

437+
# Check for duplicates in names.
438+
_validate_names(kwds.get("names", None))
439+
410440
# Create the parser.
411441
parser = TextFileReader(filepath_or_buffer, **kwds)
412442

pandas/tests/io/parser/common.py

-14
Original file line numberDiff line numberDiff line change
@@ -1357,20 +1357,6 @@ def test_euro_decimal_format(self):
13571357
assert df2['Number2'].dtype == float
13581358
assert df2['Number3'].dtype == float
13591359

1360-
def test_read_duplicate_names(self):
1361-
# See gh-7160
1362-
data = "a,b,a\n0,1,2\n3,4,5"
1363-
df = self.read_csv(StringIO(data))
1364-
expected = DataFrame([[0, 1, 2], [3, 4, 5]],
1365-
columns=['a', 'b', 'a.1'])
1366-
tm.assert_frame_equal(df, expected)
1367-
1368-
data = "0,1,2\n3,4,5"
1369-
df = self.read_csv(StringIO(data), names=["a", "b", "a"])
1370-
expected = DataFrame([[0, 1, 2], [3, 4, 5]],
1371-
columns=['a', 'b', 'a.1'])
1372-
tm.assert_frame_equal(df, expected)
1373-
13741360
def test_inf_parsing(self):
13751361
data = """\
13761362
,A

pandas/tests/io/parser/dtypes.py

+5-4
Original file line numberDiff line numberDiff line change
@@ -204,10 +204,11 @@ def test_empty_with_dup_column_pass_dtype_by_indexes(self):
204204
result = self.read_csv(StringIO(data), dtype={0: 'u1', 1: 'f'})
205205
tm.assert_frame_equal(result, expected, check_index_type=False)
206206

207-
data = ''
208-
result = self.read_csv(StringIO(data), names=['one', 'one'],
209-
dtype={0: 'u1', 1: 'f'})
210-
tm.assert_frame_equal(result, expected, check_index_type=False)
207+
with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
208+
data = ''
209+
result = self.read_csv(StringIO(data), names=['one', 'one'],
210+
dtype={0: 'u1', 1: 'f'})
211+
tm.assert_frame_equal(result, expected, check_index_type=False)
211212

212213
def test_raise_on_passed_int_dtype_with_nas(self):
213214
# see gh-2631

pandas/tests/io/parser/mangle_dupes.py

+35-11
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@
77
"""
88

99
from pandas.compat import StringIO
10+
from pandas import DataFrame
11+
12+
import pandas.util.testing as tm
1013

1114

1215
class DupeColumnTests(object):
@@ -25,6 +28,21 @@ def test_basic(self):
2528
mangle_dupe_cols=True)
2629
assert list(df.columns) == expected
2730

31+
def test_basic_names(self):
32+
# See gh-7160
33+
data = "a,b,a\n0,1,2\n3,4,5"
34+
expected = DataFrame([[0, 1, 2], [3, 4, 5]],
35+
columns=["a", "b", "a.1"])
36+
37+
df = self.read_csv(StringIO(data))
38+
tm.assert_frame_equal(df, expected)
39+
40+
with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
41+
data = "0,1,2\n3,4,5"
42+
df = self.read_csv(StringIO(data),
43+
names=["a", "b", "a"])
44+
tm.assert_frame_equal(df, expected)
45+
2846
def test_thorough_mangle_columns(self):
2947
# see gh-17060
3048
data = "a,a,a.1\n1,2,3"
@@ -45,20 +63,26 @@ def test_thorough_mangle_names(self):
4563
# see gh-17095
4664
data = "a,b,b\n1,2,3"
4765
names = ["a.1", "a.1", "a.1.1"]
48-
df = self.read_csv(StringIO(data), sep=",", names=names,
49-
mangle_dupe_cols=True)
50-
assert list(df.columns) == ["a.1", "a.1.1", "a.1.1.1"]
66+
67+
with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
68+
df = self.read_csv(StringIO(data), sep=",", names=names,
69+
mangle_dupe_cols=True)
70+
assert list(df.columns) == ["a.1", "a.1.1", "a.1.1.1"]
5171

5272
data = "a,b,c,d,e,f\n1,2,3,4,5,6"
5373
names = ["a", "a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"]
54-
df = self.read_csv(StringIO(data), sep=",", names=names,
55-
mangle_dupe_cols=True)
56-
assert list(df.columns) == ["a", "a.1", "a.1.1", "a.1.1.1",
57-
"a.1.1.1.1", "a.1.1.1.1.1"]
74+
75+
with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
76+
df = self.read_csv(StringIO(data), sep=",", names=names,
77+
mangle_dupe_cols=True)
78+
assert list(df.columns) == ["a", "a.1", "a.1.1", "a.1.1.1",
79+
"a.1.1.1.1", "a.1.1.1.1.1"]
5880

5981
data = "a,b,c,d,e,f,g\n1,2,3,4,5,6,7"
6082
names = ["a", "a", "a.3", "a.1", "a.2", "a", "a"]
61-
df = self.read_csv(StringIO(data), sep=",", names=names,
62-
mangle_dupe_cols=True)
63-
assert list(df.columns) == ["a", "a.1", "a.3", "a.1.1",
64-
"a.2", "a.2.1", "a.3.1"]
83+
84+
with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
85+
df = self.read_csv(StringIO(data), sep=",", names=names,
86+
mangle_dupe_cols=True)
87+
assert list(df.columns) == ["a", "a.1", "a.3", "a.1.1",
88+
"a.2", "a.2.1", "a.3.1"]

0 commit comments

Comments
 (0)