Skip to content

Commit 5ca6ff5

Browse files
committed
ENH: add explicit duplicate check when creating an index in parsing functions,
to further address concerns raised in GH #226
1 parent 0cc5616 commit 5ca6ff5

File tree

6 files changed

+36
-5
lines changed

6 files changed

+36
-5
lines changed

RELEASE.rst

+5
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ pandas 0.4.4
1515
- Added `parse_dates` option to `read_csv` and `read_table` methods to
1616
optionally try to parse dates in the index columns
1717
- Added ability to join on multiple columns in `DataFrame.join` (GH #214)
18+
- Added private `_get_duplicates` function to `Index` for identifying
19+
duplicate values more easily
1820

1921
**API Changes**
2022

@@ -25,6 +27,9 @@ pandas 0.4.4
2527

2628
**Improvements to existing features**
2729

30+
- File parsing functions like `read_csv` and `read_table` will explicitly
31+
check if a parsed index has duplicates and raise a more helpful exception
32+
rather than deferring the check until later
2833
- Refactored merging / joining code into a tidy class and disabled unnecessary
2934
computations in the float/object case, thus getting about 10% better
3035
performance

pandas/core/frame.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -2125,7 +2125,7 @@ def append(self, other, ignore_index=False):
21252125
new_index = None
21262126
else:
21272127
new_index = self.index.append(other.index)
2128-
new_index._verify_integrity()
2128+
assert(new_index._verify_integrity())
21292129

21302130
if self.columns.equals(other.columns):
21312131
return self._append_same_columns(other, new_index)

pandas/core/index.py

+12
Original file line numberDiff line numberDiff line change
@@ -99,8 +99,20 @@ def indexMap(self):
9999
return self._indexMap
100100

101101
def _verify_integrity(self):
102+
if self._indexMap is None:
103+
try:
104+
self.indexMap
105+
except Exception:
106+
return False
102107
return len(self.indexMap) == len(self)
103108

109+
def _get_duplicates(self):
110+
from collections import defaultdict
111+
counter = defaultdict(lambda: 0)
112+
for k in self.values:
113+
counter[k] += 1
114+
return sorted(k for k, v in counter.iteritems() if v > 1)
115+
104116
_allDates = None
105117
def is_all_dates(self):
106118
"""

pandas/io/parsers.py

+7-2
Original file line numberDiff line numberDiff line change
@@ -196,12 +196,17 @@ def _simple_parser(lines, colNames=None, header=0, index_col=0,
196196
index = _try_parse_dates(index, parser=date_parser)
197197
index = Index(_maybe_convert_int(np.array(index, dtype=object)))
198198
else:
199-
index = MultiIndex.from_arrays(_maybe_convert_int_mindex(index,
200-
parse_dates, date_parser),
199+
arrays = _maybe_convert_int_mindex(index, parse_dates,
200+
date_parser)
201+
index = MultiIndex.from_arrays(arrays,
201202
names=idx_names)
202203
else:
203204
index = Index(np.arange(len(content)))
204205

206+
if not index._verify_integrity():
207+
dups = index._get_duplicates()
208+
raise Exception('Index has duplicates: %s' % str(dups))
209+
205210
if len(columns) != len(zipped_content):
206211
raise Exception('wrong number of columns')
207212

pandas/io/tests/test_parsers.py

+10-1
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,16 @@ def test_read_table_wrong_num_columns(self):
140140
"""
141141
self.assertRaises(Exception, read_csv, StringIO(data))
142142

143-
143+
def test_read_table_duplicate_index(self):
144+
data = """index,A,B,C,D
145+
foo,2,3,4,5
146+
bar,7,8,9,10
147+
baz,12,13,14,15
148+
qux,12,13,14,15
149+
foo,12,13,14,15
150+
bar,12,13,14,15
151+
"""
152+
self.assertRaises(Exception, read_csv, StringIO(data), index_col=0)
144153

145154
def curpath():
146155
pth, _ = os.path.split(os.path.abspath(__file__))

pandas/tests/test_index.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def test_deepcopy(self):
3333

3434
def test_duplicates(self):
3535
idx = Index([0, 0, 0])
36-
self.assertRaises(Exception, idx._verify_integrity)
36+
self.assert_(not idx._verify_integrity())
3737

3838
def test_sort(self):
3939
self.assertRaises(Exception, self.strIndex.sort)

0 commit comments

Comments
 (0)