Skip to content

Commit 113e7d8

Browse files
committed
Merge pull request #4462 from jreback/hdf_opt2
BUG: bug when using chunksize and writing ndim > 2
2 parents 8915ce6 + 3a979ac commit 113e7d8

File tree

2 files changed

+58
-29
lines changed

2 files changed

+58
-29
lines changed

pandas/io/pytables.py

+35 -29
Original file line number | Diff line number | Diff line change
@@ -3037,7 +3037,11 @@ def write(self, obj, axes=None, append=False, complib=None,
30373037
self.write_data(chunksize)
30383038

30393039
def write_data(self, chunksize):
3040-
""" fast writing of data: requires specific cython routines each axis shape """
3040+
""" we form the data into a 2-d including indexes,values,mask
3041+
write chunk-by-chunk """
3042+
3043+
names = self.dtype.names
3044+
nrows = self.nrows_expected
30413045

30423046
# create the masks & values
30433047
masks = []
@@ -3052,30 +3056,49 @@ def write_data(self, chunksize):
30523056
mask = masks[0]
30533057
for m in masks[1:]:
30543058
mask = mask & m
3059+
mask = mask.ravel()
3060+
3061+
# broadcast the indexes if needed
3062+
indexes = [ a.cvalues for a in self.index_axes ]
3063+
nindexes = len(indexes)
3064+
bindexes = []
3065+
for i, idx in enumerate(indexes):
3066+
3067+
# broadcast to all other indexes except myself
3068+
if i > 0 and i < nindexes:
3069+
repeater = np.prod([indexes[bi].shape[0] for bi in range(0,i)])
3070+
idx = np.tile(idx,repeater)
30553071

3056-
# the arguments
3057-
indexes = [a.cvalues for a in self.index_axes]
3058-
values = [a.take_data() for a in self.values_axes]
3072+
if i < nindexes-1:
3073+
repeater = np.prod([indexes[bi].shape[0] for bi in range(i+1,nindexes)])
3074+
idx = np.repeat(idx,repeater)
3075+
3076+
bindexes.append(idx)
30593077

30603078
# transpose the values so first dimension is last
3079+
# reshape the values if needed
3080+
values = [ a.take_data() for a in self.values_axes]
30613081
values = [ v.transpose(np.roll(np.arange(v.ndim),v.ndim-1)) for v in values ]
3082+
bvalues = []
3083+
for i, v in enumerate(values):
3084+
new_shape = (nrows,) + self.dtype[names[nindexes + i]].shape
3085+
bvalues.append(values[i].ravel().reshape(new_shape))
30623086

30633087
# write the chunks
30643088
if chunksize is None:
30653089
chunksize = 100000
30663090

3067-
rows = self.nrows_expected
3068-
chunks = int(rows / chunksize) + 1
3091+
chunks = int(nrows / chunksize) + 1
30693092
for i in range(chunks):
30703093
start_i = i * chunksize
3071-
end_i = min((i + 1) * chunksize, rows)
3094+
end_i = min((i + 1) * chunksize, nrows)
30723095
if start_i >= end_i:
30733096
break
30743097

30753098
self.write_data_chunk(
3076-
indexes=[a[start_i:end_i] for a in indexes],
3099+
indexes=[a[start_i:end_i] for a in bindexes],
30773100
mask=mask[start_i:end_i],
3078-
values=[v[start_i:end_i] for v in values])
3101+
values=[v[start_i:end_i] for v in bvalues])
30793102

30803103
def write_data_chunk(self, indexes, mask, values):
30813104

@@ -3085,35 +3108,18 @@ def write_data_chunk(self, indexes, mask, values):
30853108
return
30863109

30873110
try:
3088-
nrows = np.prod([ idx.shape[0] for idx in indexes ])
3111+
nrows = indexes[0].shape[0]
30893112
rows = np.empty(nrows,dtype=self.dtype)
30903113
names = self.dtype.names
3114+
nindexes = len(indexes)
30913115

30923116
# indexes
3093-
nindexes = len(indexes)
30943117
for i, idx in enumerate(indexes):
3095-
3096-
# broadcast to all other indexes except myself
3097-
if i > 0 and i < nindexes:
3098-
repeater = np.prod([indexes[bi].shape[0] for bi in range(0,i)])
3099-
idx = np.tile(idx,repeater)
3100-
3101-
if i < nindexes-1:
3102-
repeater = np.prod([indexes[bi].shape[0] for bi in range(i+1,nindexes)])
3103-
idx = np.repeat(idx,repeater)
3104-
31053118
rows[names[i]] = idx
31063119

31073120
# values
31083121
for i, v in enumerate(values):
3109-
name = names[nindexes + i]
3110-
b = values[i]
3111-
3112-
# reshape
3113-
new_shape = (nrows,) + self.dtype[name].shape
3114-
b = b.ravel().reshape(new_shape)
3115-
3116-
rows[name] = b
3122+
rows[names[i+nindexes]] = v
31173123

31183124
# mask
31193125
rows = rows[~mask.ravel().astype(bool)]

pandas/io/tests/test_pytables.py

+23
Original file line number | Diff line number | Diff line change
@@ -1237,6 +1237,29 @@ def test_append_misc(self):
12371237
result = store.select('df1')
12381238
tm.assert_frame_equal(result, df)
12391239

1240+
# more chunksize in append tests
1241+
def check(obj, comparator):
1242+
for c in [10, 200, 1000]:
1243+
with ensure_clean(self.path,mode='w') as store:
1244+
store.append('obj', obj, chunksize=c)
1245+
result = store.select('obj')
1246+
comparator(result,obj)
1247+
1248+
df = tm.makeDataFrame()
1249+
df['string'] = 'foo'
1250+
df['float322'] = 1.
1251+
df['float322'] = df['float322'].astype('float32')
1252+
df['bool'] = df['float322'] > 0
1253+
df['time1'] = Timestamp('20130101')
1254+
df['time2'] = Timestamp('20130102')
1255+
check(df, tm.assert_frame_equal)
1256+
1257+
p = tm.makePanel()
1258+
check(p, tm.assert_panel_equal)
1259+
1260+
p4d = tm.makePanel4D()
1261+
check(p4d, tm.assert_panel4d_equal)
1262+
12401263
def test_append_raise(self):
12411264

12421265
with ensure_clean(self.path) as store:

0 commit comments

Comments
 (0)