Skip to content

Commit b1a7a4a

Browse files
committed
API: Warn about dups in names for read_csv
xref pandas-devgh-17095.
1 parent e8a1765 commit b1a7a4a

File tree

5 files changed

+77
-29
lines changed

5 files changed

+77
-29
lines changed

doc/source/whatsnew/v0.21.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,7 @@ Other API Changes
282282
- The Categorical constructor no longer accepts a scalar for the ``categories`` keyword. (:issue:`16022`)
283283
- Accessing a non-existent attribute on a closed :class:`~pandas.HDFStore` will now
284284
raise an ``AttributeError`` rather than a ``ClosedFileError`` (:issue:`16301`)
285+
- :func:`read_csv` now issues a ``FutureWarning`` if the ``names`` parameter contains duplicates (:issue:`17095`)
285286
- :func:`read_csv` now treats ``'null'`` strings as missing values by default (:issue:`16471`)
286287
- :func:`read_csv` now treats ``'n/a'`` strings as missing values by default (:issue:`16078`)
287288
- :class:`pandas.HDFStore`'s string representation is now faster and less detailed. For the previous behavior, use ``pandas.HDFStore.info()``. (:issue:`16503`).

pandas/io/parsers.py

+36
Original file line numberDiff line numberDiff line change
@@ -384,6 +384,38 @@ def _validate_integer(name, val, min_val=0):
384384
return val
385385

386386

387+
def _check_dup_names(names):
    """
    Check if the `names` parameter contains duplicates.

    Currently, this function issues a ``FutureWarning`` if that is the
    case. In the future, we will raise an error.

    Parameters
    ----------
    names : array-like or None
        An array containing a list of the names used for the output DataFrame.
        ``None`` means no names were supplied, and nothing is checked.

    Returns
    -------
    None
        The function only has the side effect of (possibly) warning.
    """

    if names is None:
        return

    # Names encountered so far; a set gives O(1) membership checks.
    seen = set()

    for name in names:
        if name in seen:
            msg = ("Duplicate names specified. This "
                   "will raise an error in the future.")
            # stacklevel=3 attributes the warning two frames above this
            # helper, i.e. to the caller of the function that invoked it.
            warnings.warn(msg, FutureWarning, stacklevel=3)

            # A single warning is enough; stop at the first duplicate.
            return

        seen.add(name)
417+
418+
387419
def _read(filepath_or_buffer, kwds):
388420
"""Generic reader of line files."""
389421
encoding = kwds.get('encoding', None)
@@ -406,6 +438,10 @@ def _read(filepath_or_buffer, kwds):
406438
chunksize = _validate_integer('chunksize', kwds.get('chunksize', None), 1)
407439
nrows = _validate_integer('nrows', kwds.get('nrows', None))
408440

441+
# Check for duplicates in names.
442+
names = kwds.get("names", None)
443+
_check_dup_names(names)
444+
409445
# Create the parser.
410446
parser = TextFileReader(filepath_or_buffer, **kwds)
411447

pandas/tests/io/parser/common.py

-14
Original file line numberDiff line numberDiff line change
@@ -1357,20 +1357,6 @@ def test_euro_decimal_format(self):
13571357
assert df2['Number2'].dtype == float
13581358
assert df2['Number3'].dtype == float
13591359

1360-
def test_read_duplicate_names(self):
1361-
# See gh-7160
1362-
data = "a,b,a\n0,1,2\n3,4,5"
1363-
df = self.read_csv(StringIO(data))
1364-
expected = DataFrame([[0, 1, 2], [3, 4, 5]],
1365-
columns=['a', 'b', 'a.1'])
1366-
tm.assert_frame_equal(df, expected)
1367-
1368-
data = "0,1,2\n3,4,5"
1369-
df = self.read_csv(StringIO(data), names=["a", "b", "a"])
1370-
expected = DataFrame([[0, 1, 2], [3, 4, 5]],
1371-
columns=['a', 'b', 'a.1'])
1372-
tm.assert_frame_equal(df, expected)
1373-
13741360
def test_inf_parsing(self):
13751361
data = """\
13761362
,A

pandas/tests/io/parser/dtypes.py

+5-4
Original file line numberDiff line numberDiff line change
@@ -204,10 +204,11 @@ def test_empty_with_dup_column_pass_dtype_by_indexes(self):
204204
result = self.read_csv(StringIO(data), dtype={0: 'u1', 1: 'f'})
205205
tm.assert_frame_equal(result, expected, check_index_type=False)
206206

207-
data = ''
208-
result = self.read_csv(StringIO(data), names=['one', 'one'],
209-
dtype={0: 'u1', 1: 'f'})
210-
tm.assert_frame_equal(result, expected, check_index_type=False)
207+
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
208+
data = ''
209+
result = self.read_csv(StringIO(data), names=['one', 'one'],
210+
dtype={0: 'u1', 1: 'f'})
211+
tm.assert_frame_equal(result, expected, check_index_type=False)
211212

212213
def test_raise_on_passed_int_dtype_with_nas(self):
213214
# see gh-2631

pandas/tests/io/parser/mangle_dupes.py

+35-11
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@
77
"""
88

99
from pandas.compat import StringIO
10+
from pandas import DataFrame
11+
12+
import pandas.util.testing as tm
1013

1114

1215
class DupeColumnTests(object):
@@ -25,6 +28,21 @@ def test_basic(self):
2528
mangle_dupe_cols=True)
2629
assert list(df.columns) == expected
2730

31+
def test_basic_names(self):
32+
# See gh-7160
33+
data = "a,b,a\n0,1,2\n3,4,5"
34+
expected = DataFrame([[0, 1, 2], [3, 4, 5]],
35+
columns=["a", "b", "a.1"])
36+
37+
df = self.read_csv(StringIO(data))
38+
tm.assert_frame_equal(df, expected)
39+
40+
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
41+
data = "0,1,2\n3,4,5"
42+
df = self.read_csv(StringIO(data),
43+
names=["a", "b", "a"])
44+
tm.assert_frame_equal(df, expected)
45+
2846
def test_thorough_mangle_columns(self):
2947
# see gh-17060
3048
data = "a,a,a.1\n1,2,3"
@@ -45,20 +63,26 @@ def test_thorough_mangle_names(self):
4563
# see gh-17095
4664
data = "a,b,b\n1,2,3"
4765
names = ["a.1", "a.1", "a.1.1"]
48-
df = self.read_csv(StringIO(data), sep=",", names=names,
49-
mangle_dupe_cols=True)
50-
assert list(df.columns) == ["a.1", "a.1.1", "a.1.1.1"]
66+
67+
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
68+
df = self.read_csv(StringIO(data), sep=",", names=names,
69+
mangle_dupe_cols=True)
70+
assert list(df.columns) == ["a.1", "a.1.1", "a.1.1.1"]
5171

5272
data = "a,b,c,d,e,f\n1,2,3,4,5,6"
5373
names = ["a", "a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"]
54-
df = self.read_csv(StringIO(data), sep=",", names=names,
55-
mangle_dupe_cols=True)
56-
assert list(df.columns) == ["a", "a.1", "a.1.1", "a.1.1.1",
57-
"a.1.1.1.1", "a.1.1.1.1.1"]
74+
75+
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
76+
df = self.read_csv(StringIO(data), sep=",", names=names,
77+
mangle_dupe_cols=True)
78+
assert list(df.columns) == ["a", "a.1", "a.1.1", "a.1.1.1",
79+
"a.1.1.1.1", "a.1.1.1.1.1"]
5880

5981
data = "a,b,c,d,e,f,g\n1,2,3,4,5,6,7"
6082
names = ["a", "a", "a.3", "a.1", "a.2", "a", "a"]
61-
df = self.read_csv(StringIO(data), sep=",", names=names,
62-
mangle_dupe_cols=True)
63-
assert list(df.columns) == ["a", "a.1", "a.3", "a.1.1",
64-
"a.2", "a.2.1", "a.3.1"]
83+
84+
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
85+
df = self.read_csv(StringIO(data), sep=",", names=names,
86+
mangle_dupe_cols=True)
87+
assert list(df.columns) == ["a", "a.1", "a.3", "a.1.1",
88+
"a.2", "a.2.1", "a.3.1"]

0 commit comments

Comments
 (0)