Skip to content

Commit c1ab38e

Browse files
committed
Merge pull request #4769 from jreback/hdf_dups2
TST: more robust testing for HDFStore dups
2 parents 57b0184 + b07e020 commit c1ab38e

File tree

3 files changed

+76
-25
lines changed

3 files changed

+76
-25
lines changed

pandas/io/pytables.py

+25-11
Original file line numberDiff line numberDiff line change
@@ -667,7 +667,7 @@ def func(_start, _stop):
667667
axis = list(set([t.non_index_axes[0][0] for t in tbls]))[0]
668668

669669
# concat and return
670-
return concat(objs, axis=axis, verify_integrity=True)
670+
return concat(objs, axis=axis, verify_integrity=True).consolidate()
671671

672672
if iterator or chunksize is not None:
673673
return TableIterator(self, func, nrows=nrows, start=start, stop=stop, chunksize=chunksize, auto_close=auto_close)
@@ -2910,9 +2910,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None,
29102910

29112911
# reindex by our non_index_axes & compute data_columns
29122912
for a in self.non_index_axes:
2913-
labels = _ensure_index(a[1])
2914-
if not labels.equals(obj._get_axis(a[0])):
2915-
obj = obj.reindex_axis(labels, axis=a[0])
2913+
obj = _reindex_axis(obj, a[0], a[1])
29162914

29172915
# figure out data_columns and get out blocks
29182916
block_obj = self.get_object(obj).consolidate()
@@ -3000,11 +2998,7 @@ def process_axes(self, obj, columns=None):
30002998

30012999
# reorder by any non_index_axes & limit to the select columns
30023000
for axis, labels in self.non_index_axes:
3003-
if columns is not None:
3004-
labels = Index(labels) & Index(columns)
3005-
labels = _ensure_index(labels)
3006-
if not labels.equals(obj._get_axis(axis)):
3007-
obj = obj.reindex_axis(labels, axis=axis)
3001+
obj = _reindex_axis(obj, axis, labels, columns)
30083002

30093003
# apply the selection filters (but keep in the same order)
30103004
if self.selection.filter:
@@ -3219,7 +3213,7 @@ def read(self, where=None, columns=None, **kwargs):
32193213
if len(objs) == 1:
32203214
wp = objs[0]
32213215
else:
3222-
wp = concat(objs, axis=0, verify_integrity=False)
3216+
wp = concat(objs, axis=0, verify_integrity=False).consolidate()
32233217

32243218
# apply the selection filters & axis orderings
32253219
wp = self.process_axes(wp, columns=columns)
@@ -3510,7 +3504,7 @@ def read(self, where=None, columns=None, **kwargs):
35103504
if len(frames) == 1:
35113505
df = frames[0]
35123506
else:
3513-
df = concat(frames, axis=1, verify_integrity=False)
3507+
df = concat(frames, axis=1, verify_integrity=False).consolidate()
35143508

35153509
# apply the selection filters & axis orderings
35163510
df = self.process_axes(df, columns=columns)
@@ -3683,6 +3677,26 @@ class AppendableNDimTable(AppendablePanelTable):
36833677
obj_type = Panel4D
36843678

36853679

3680+
def _reindex_axis(obj, axis, labels, other=None):
    """ select `labels` along `axis` of `obj`, optionally limited by `other`

    Parameters
    ----------
    obj : the object to select from; must provide `_get_axis`, `ndim`
        and `.loc`
    axis : the integer position of the axis to operate on
    labels : the labels wanted on that axis (coerced to an Index)
    other : optional second set of labels; when given, only labels
        present in both `labels` and `other` are kept

    Returns
    -------
    `obj` itself when its axis already matches, otherwise the result of
    a `.loc` selection of the (de-duplicated) labels along `axis`
    """
    current = obj._get_axis(axis)
    labels = _ensure_index(labels)
    if other is not None:
        other = _ensure_index(other)

    # fast path: nothing to do when the requested labels (and other,
    # if supplied) already equal the current axis
    agrees_with_other = other is None or labels.equals(other)
    if agrees_with_other and labels.equals(current):
        return obj

    # work on unique labels only; the .loc selection below is handed
    # each label once
    labels = _ensure_index(labels.unique())
    if other is not None:
        labels = labels & _ensure_index(other.unique())

    if labels.equals(current):
        return obj

    # full slice on every axis except the one being selected
    indexer = [slice(None, None) for _ in range(obj.ndim)]
    indexer[axis] = labels
    return obj.loc[tuple(indexer)]
3699+
36863700
def _get_info(info, name):
36873701
""" get/create the info for this name """
36883702
try:

pandas/io/tests/test_pytables.py

+26-3
Original file line numberDiff line numberDiff line change
@@ -2298,15 +2298,24 @@ def test_wide_table(self):
22982298

22992299
def test_select_with_dups(self):
23002300

2301-
23022301
# single dtypes
23032302
df = DataFrame(np.random.randn(10,4),columns=['A','A','B','B'])
23042303
df.index = date_range('20130101 9:30',periods=10,freq='T')
23052304

23062305
with ensure_clean(self.path) as store:
23072306
store.append('df',df)
2307+
23082308
result = store.select('df')
2309-
assert_frame_equal(result,df)
2309+
expected = df
2310+
assert_frame_equal(result,expected,by_blocks=True)
2311+
2312+
result = store.select('df',columns=df.columns)
2313+
expected = df
2314+
assert_frame_equal(result,expected,by_blocks=True)
2315+
2316+
result = store.select('df',columns=['A'])
2317+
expected = df.loc[:,['A']]
2318+
assert_frame_equal(result,expected)
23102319

23112320
# dups across dtypes
23122321
df = concat([DataFrame(np.random.randn(10,4),columns=['A','A','B','B']),
@@ -2316,8 +2325,22 @@ def test_select_with_dups(self):
23162325

23172326
with ensure_clean(self.path) as store:
23182327
store.append('df',df)
2328+
23192329
result = store.select('df')
2320-
assert_frame_equal(result,df)
2330+
expected = df
2331+
assert_frame_equal(result,expected,by_blocks=True)
2332+
2333+
result = store.select('df',columns=df.columns)
2334+
expected = df
2335+
assert_frame_equal(result,expected,by_blocks=True)
2336+
2337+
expected = df.loc[:,['A']]
2338+
result = store.select('df',columns=['A'])
2339+
assert_frame_equal(result,expected,by_blocks=True)
2340+
2341+
expected = df.loc[:,['B','A']]
2342+
result = store.select('df',columns=['B','A'])
2343+
assert_frame_equal(result,expected,by_blocks=True)
23212344

23222345
def test_wide_table_dups(self):
23232346
wp = tm.makePanel()

pandas/util/testing.py

+25-11
Original file line numberDiff line numberDiff line change
@@ -258,27 +258,41 @@ def assert_frame_equal(left, right, check_dtype=True,
258258
check_column_type=False,
259259
check_frame_type=False,
260260
check_less_precise=False,
261-
check_names=True):
261+
check_names=True,
262+
by_blocks=False):
262263
if check_frame_type:
263264
assert_isinstance(left, type(right))
264265
assert_isinstance(left, DataFrame)
265266
assert_isinstance(right, DataFrame)
266267

267268
if check_less_precise:
268-
assert_almost_equal(left.columns, right.columns)
269+
if not by_blocks:
270+
assert_almost_equal(left.columns, right.columns)
269271
assert_almost_equal(left.index, right.index)
270272
else:
271-
assert_index_equal(left.columns, right.columns)
273+
if not by_blocks:
274+
assert_index_equal(left.columns, right.columns)
272275
assert_index_equal(left.index, right.index)
273276

274-
for i, col in enumerate(left.columns):
275-
assert col in right
276-
lcol = left.icol(i)
277-
rcol = right.icol(i)
278-
assert_series_equal(lcol, rcol,
279-
check_dtype=check_dtype,
280-
check_index_type=check_index_type,
281-
check_less_precise=check_less_precise)
277+
# compare by blocks
278+
if by_blocks:
279+
rblocks = right.blocks
280+
lblocks = left.blocks
281+
for dtype in list(set(list(lblocks.keys()) + list(rblocks.keys()))):
282+
assert dtype in lblocks
283+
assert dtype in rblocks
284+
assert_frame_equal(lblocks[dtype],rblocks[dtype],check_dtype=check_dtype)
285+
286+
# compare by columns
287+
else:
288+
for i, col in enumerate(left.columns):
289+
assert col in right
290+
lcol = left.icol(i)
291+
rcol = right.icol(i)
292+
assert_series_equal(lcol, rcol,
293+
check_dtype=check_dtype,
294+
check_index_type=check_index_type,
295+
check_less_precise=check_less_precise)
282296

283297
if check_index_type:
284298
assert_isinstance(left.index, type(right.index))

0 commit comments

Comments
 (0)