Skip to content

Commit 4f67501

Browse files
committed
PERF: optimize memory usage for to_hdf
1 parent 7bce39b commit 4f67501

File tree

5 files changed

+70
-48
lines changed

5 files changed

+70
-48
lines changed

doc/source/whatsnew/v0.16.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -514,6 +514,7 @@ Performance Improvements
514514
- Performance improvements in ``MultiIndex.sortlevel`` (:issue:`9445`)
515515
- Performance and memory usage improvements in ``DataFrame.duplicated`` (:issue:`9398`)
516516
- Cythonized ``Period`` (:issue:`9440`)
517+
- Decreased memory usage on ``to_hdf`` (:issue:`9648`)
517518

518519
.. _whatsnew_0160.bug_fixes:
519520

pandas/core/generic.py

+8-9
Original file line numberDiff line numberDiff line change
@@ -2014,6 +2014,14 @@ def __setattr__(self, name, value):
20142014
#----------------------------------------------------------------------
20152015
# Consolidation of internals
20162016

2017+
def _protect_consolidate(self, f):
2018+
""" consolidate _data. if the blocks have changed, then clear the cache """
2019+
blocks_before = len(self._data.blocks)
2020+
result = f()
2021+
if len(self._data.blocks) != blocks_before:
2022+
self._clear_item_cache()
2023+
return result
2024+
20172025
def _consolidate_inplace(self):
20182026
f = lambda: self._data.consolidate()
20192027
self._data = self._protect_consolidate(f)
@@ -2038,8 +2046,6 @@ def consolidate(self, inplace=False):
20382046
else:
20392047
f = lambda: self._data.consolidate()
20402048
cons_data = self._protect_consolidate(f)
2041-
if cons_data is self._data:
2042-
cons_data = cons_data.copy()
20432049
return self._constructor(cons_data).__finalize__(self)
20442050

20452051
@property
@@ -2075,13 +2081,6 @@ def _check_inplace_setting(self, value):
20752081

20762082
return True
20772083

2078-
def _protect_consolidate(self, f):
2079-
blocks_before = len(self._data.blocks)
2080-
result = f()
2081-
if len(self._data.blocks) != blocks_before:
2082-
self._clear_item_cache()
2083-
return result
2084-
20852084
def _get_numeric_data(self):
20862085
return self._constructor(
20872086
self._data.get_numeric_data()).__finalize__(self)

pandas/core/internals.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1752,7 +1752,7 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None,
17521752
if self.is_categorical_astype(dtype):
17531753
values = self.values
17541754
else:
1755-
values = np.array(self.values).astype(dtype)
1755+
values = np.asarray(self.values).astype(dtype, copy=False)
17561756

17571757
if copy:
17581758
values = values.copy()

pandas/io/pytables.py

+49-36
Original file line numberDiff line numberDiff line change
@@ -1782,13 +1782,13 @@ def set_atom(self, block, block_items, existing_col, min_itemsize,
17821782
return self.set_atom_timedelta64(block)
17831783

17841784
dtype = block.dtype.name
1785-
rvalues = block.values.ravel()
1786-
inferred_type = lib.infer_dtype(rvalues)
1785+
inferred_type = lib.infer_dtype(block.values)
17871786

17881787
if inferred_type == 'date':
17891788
raise TypeError(
17901789
"[date] is not implemented as a table column")
17911790
elif inferred_type == 'datetime':
1791+
rvalues = block.values.ravel()
17921792
if getattr(rvalues[0], 'tzinfo', None) is not None:
17931793

17941794
# if this block has more than one timezone, raise
@@ -1917,7 +1917,7 @@ def get_atom_data(self, block, kind=None):
19171917
def set_atom_data(self, block):
19181918
self.kind = block.dtype.name
19191919
self.typ = self.get_atom_data(block)
1920-
self.set_data(block.values.astype(self.typ.type))
1920+
self.set_data(block.values.astype(self.typ.type, copy=False))
19211921

19221922
def set_atom_categorical(self, block, items, info=None, values=None):
19231923
# currently only supports a 1-D categorical
@@ -2016,7 +2016,7 @@ def convert(self, values, nan_rep, encoding):
20162016

20172017
index = DatetimeIndex(
20182018
self.data.ravel(), tz='UTC').tz_convert(self.tz)
2019-
self.data = np.array(
2019+
self.data = np.asarray(
20202020
index.tolist(), dtype=object).reshape(self.data.shape)
20212021

20222022
else:
@@ -2026,14 +2026,14 @@ def convert(self, values, nan_rep, encoding):
20262026
self.data = np.asarray(self.data, dtype='m8[ns]')
20272027
elif dtype == u('date'):
20282028
try:
2029-
self.data = np.array(
2029+
self.data = np.asarray(
20302030
[date.fromordinal(v) for v in self.data], dtype=object)
20312031
except ValueError:
2032-
self.data = np.array(
2032+
self.data = np.asarray(
20332033
[date.fromtimestamp(v) for v in self.data],
20342034
dtype=object)
20352035
elif dtype == u('datetime'):
2036-
self.data = np.array(
2036+
self.data = np.asarray(
20372037
[datetime.fromtimestamp(v) for v in self.data],
20382038
dtype=object)
20392039

@@ -2048,9 +2048,9 @@ def convert(self, values, nan_rep, encoding):
20482048
else:
20492049

20502050
try:
2051-
self.data = self.data.astype(dtype)
2051+
self.data = self.data.astype(dtype, copy=False)
20522052
except:
2053-
self.data = self.data.astype('O')
2053+
self.data = self.data.astype('O', copy=False)
20542054

20552055
# convert nans / decode
20562056
if _ensure_decoded(self.kind) == u('string'):
@@ -2337,9 +2337,9 @@ def read_array(self, key):
23372337
ret = data
23382338

23392339
if dtype == u('datetime64'):
2340-
ret = np.array(ret, dtype='M8[ns]')
2340+
ret = np.asarray(ret, dtype='M8[ns]')
23412341
elif dtype == u('timedelta64'):
2342-
ret = np.array(ret, dtype='m8[ns]')
2342+
ret = np.asarray(ret, dtype='m8[ns]')
23432343

23442344
if transposed:
23452345
return ret.T
@@ -3793,7 +3793,7 @@ def write_data(self, chunksize, dropna=True):
37933793
# figure the mask: only do if we can successfully process this
37943794
# column, otherwise ignore the mask
37953795
mask = com.isnull(a.data).all(axis=0)
3796-
masks.append(mask.astype('u1'))
3796+
masks.append(mask.astype('u1', copy=False))
37973797

37983798
# consolidate masks
37993799
mask = masks[0]
@@ -3803,8 +3803,7 @@ def write_data(self, chunksize, dropna=True):
38033803

38043804
else:
38053805

3806-
mask = np.empty(nrows, dtype='u1')
3807-
mask.fill(False)
3806+
mask = None
38083807

38093808
# broadcast the indexes if needed
38103809
indexes = [a.cvalues for a in self.index_axes]
@@ -3833,12 +3832,13 @@ def write_data(self, chunksize, dropna=True):
38333832
bvalues = []
38343833
for i, v in enumerate(values):
38353834
new_shape = (nrows,) + self.dtype[names[nindexes + i]].shape
3836-
bvalues.append(values[i].ravel().reshape(new_shape))
3835+
bvalues.append(values[i].reshape(new_shape))
38373836

38383837
# write the chunks
38393838
if chunksize is None:
38403839
chunksize = 100000
38413840

3841+
rows = np.empty(min(chunksize,nrows), dtype=self.dtype)
38423842
chunks = int(nrows / chunksize) + 1
38433843
for i in range(chunks):
38443844
start_i = i * chunksize
@@ -3847,11 +3847,20 @@ def write_data(self, chunksize, dropna=True):
38473847
break
38483848

38493849
self.write_data_chunk(
3850+
rows,
38503851
indexes=[a[start_i:end_i] for a in bindexes],
3851-
mask=mask[start_i:end_i],
3852+
mask=mask[start_i:end_i] if mask is not None else None,
38523853
values=[v[start_i:end_i] for v in bvalues])
38533854

3854-
def write_data_chunk(self, indexes, mask, values):
3855+
def write_data_chunk(self, rows, indexes, mask, values):
3856+
"""
3857+
Parameters
3858+
----------
3859+
rows : an empty memory space where we are putting the chunk
3860+
indexes : an array of the indexes
3861+
mask : an array of the masks
3862+
values : an array of the values
3863+
"""
38553864

38563865
# 0 len
38573866
for v in values:
@@ -3860,7 +3869,8 @@ def write_data_chunk(self, indexes, mask, values):
38603869

38613870
try:
38623871
nrows = indexes[0].shape[0]
3863-
rows = np.empty(nrows, dtype=self.dtype)
3872+
if nrows != len(rows):
3873+
rows = np.empty(nrows, dtype=self.dtype)
38643874
names = self.dtype.names
38653875
nindexes = len(indexes)
38663876

@@ -3873,7 +3883,10 @@ def write_data_chunk(self, indexes, mask, values):
38733883
rows[names[i + nindexes]] = v
38743884

38753885
# mask
3876-
rows = rows[~mask.ravel().astype(bool)]
3886+
if mask is not None:
3887+
m = ~mask.ravel().astype(bool, copy=False)
3888+
if not m.all():
3889+
rows = rows[m]
38773890

38783891
except Exception as detail:
38793892
raise Exception("cannot create row-data -> %s" % detail)
@@ -4240,14 +4253,14 @@ def _convert_index(index, encoding=None, format_type=None):
42404253
tz=getattr(index, 'tz', None),
42414254
index_name=index_name)
42424255
elif inferred_type == 'datetime':
4243-
converted = np.array([(time.mktime(v.timetuple()) +
4244-
v.microsecond / 1E6) for v in values],
4245-
dtype=np.float64)
4256+
converted = np.asarray([(time.mktime(v.timetuple()) +
4257+
v.microsecond / 1E6) for v in values],
4258+
dtype=np.float64)
42464259
return IndexCol(converted, 'datetime', _tables().Time64Col(),
42474260
index_name=index_name)
42484261
elif inferred_type == 'date':
4249-
converted = np.array([v.toordinal() for v in values],
4250-
dtype=np.int32)
4262+
converted = np.asarray([v.toordinal() for v in values],
4263+
dtype=np.int32)
42514264
return IndexCol(converted, 'date', _tables().Time32Col(),
42524265
index_name=index_name)
42534266
elif inferred_type == 'string':
@@ -4290,21 +4303,21 @@ def _unconvert_index(data, kind, encoding=None):
42904303
if kind == u('datetime64'):
42914304
index = DatetimeIndex(data)
42924305
elif kind == u('datetime'):
4293-
index = np.array([datetime.fromtimestamp(v) for v in data],
4294-
dtype=object)
4306+
index = np.asarray([datetime.fromtimestamp(v) for v in data],
4307+
dtype=object)
42954308
elif kind == u('date'):
42964309
try:
4297-
index = np.array(
4310+
index = np.asarray(
42984311
[date.fromordinal(v) for v in data], dtype=object)
42994312
except (ValueError):
4300-
index = np.array(
4313+
index = np.asarray(
43014314
[date.fromtimestamp(v) for v in data], dtype=object)
43024315
elif kind in (u('integer'), u('float')):
4303-
index = np.array(data)
4316+
index = np.asarray(data)
43044317
elif kind in (u('string')):
43054318
index = _unconvert_string_array(data, nan_rep=None, encoding=encoding)
43064319
elif kind == u('object'):
4307-
index = np.array(data[0])
4320+
index = np.asarray(data[0])
43084321
else: # pragma: no cover
43094322
raise ValueError('unrecognized index type %s' % kind)
43104323
return index
@@ -4315,7 +4328,7 @@ def _unconvert_index_legacy(data, kind, legacy=False, encoding=None):
43154328
if kind == u('datetime'):
43164329
index = lib.time64_to_datetime(data)
43174330
elif kind in (u('integer')):
4318-
index = np.array(data, dtype=object)
4331+
index = np.asarray(data, dtype=object)
43194332
elif kind in (u('string')):
43204333
index = _unconvert_string_array(data, nan_rep=None, encoding=encoding)
43214334
else: # pragma: no cover
@@ -4334,13 +4347,13 @@ def _convert_string_array(data, encoding, itemsize=None):
43344347
if itemsize is None:
43354348
itemsize = lib.max_len_string_array(com._ensure_object(data.ravel()))
43364349

4337-
data = np.array(data, dtype="S%d" % itemsize)
4350+
data = np.asarray(data, dtype="S%d" % itemsize)
43384351
return data
43394352

43404353
def _unconvert_string_array(data, nan_rep=None, encoding=None):
43414354
""" deserialize a string array, possibly decoding """
43424355
shape = data.shape
4343-
data = np.array(data.ravel(), dtype=object)
4356+
data = np.asarray(data.ravel(), dtype=object)
43444357

43454358
# guard against a None encoding in PY3 (because of a legacy
43464359
# where the passed encoding is actually None)
@@ -4353,7 +4366,7 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None):
43534366
dtype = "U{0}".format(itemsize)
43544367
else:
43554368
dtype = "S{0}".format(itemsize)
4356-
data = data.astype(dtype).astype(object)
4369+
data = data.astype(dtype, copy=False).astype(object, copy=False)
43574370
except (Exception) as e:
43584371
f = np.vectorize(lambda x: x.decode(encoding), otypes=[np.object])
43594372
data = f(data)
@@ -4376,7 +4389,7 @@ def _maybe_convert(values, val_kind, encoding):
43764389
def _get_converter(kind, encoding):
43774390
kind = _ensure_decoded(kind)
43784391
if kind == 'datetime64':
4379-
return lambda x: np.array(x, dtype='M8[ns]')
4392+
return lambda x: np.asarray(x, dtype='M8[ns]')
43804393
elif kind == 'datetime':
43814394
return lib.convert_timestamps
43824395
elif kind == 'string':
@@ -4421,7 +4434,7 @@ def __init__(self, table, where=None, start=None, stop=None, **kwargs):
44214434
try:
44224435
inferred = lib.infer_dtype(where)
44234436
if inferred == 'integer' or inferred == 'boolean':
4424-
where = np.array(where)
4437+
where = np.asarray(where)
44254438
if where.dtype == np.bool_:
44264439
start, stop = self.start, self.stop
44274440
if start is None:

pandas/io/tests/test_pytables.py

+11-2
Original file line numberDiff line numberDiff line change
@@ -4593,12 +4593,17 @@ def test_categorical(self):
45934593

45944594
with ensure_clean_store(self.path) as store:
45954595

4596-
s = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c'], categories=['a','b','c','d']))
4597-
4596+
# basic
4597+
s = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c'], categories=['a','b','c','d'], ordered=False))
45984598
store.append('s', s, format='table')
45994599
result = store.select('s')
46004600
tm.assert_series_equal(s, result)
46014601

4602+
s = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c'], categories=['a','b','c','d'], ordered=True))
4603+
store.append('s_ordered', s, format='table')
4604+
result = store.select('s_ordered')
4605+
tm.assert_series_equal(s, result)
4606+
46024607
df = DataFrame({"s":s, "vals":[1,2,3,4,5,6]})
46034608
store.append('df', df, format='table')
46044609
result = store.select('df')
@@ -4639,6 +4644,10 @@ def test_categorical(self):
46394644
result = store.select('df3', where = ['s in ["b","c"]'])
46404645
tm.assert_frame_equal(result, expected)
46414646

4647+
expected = df[df.s.isin(['b','c'])]
4648+
result = store.select('df3', where = ['s = ["b","c"]'])
4649+
tm.assert_frame_equal(result, expected)
4650+
46424651
expected = df[df.s.isin(['d'])]
46434652
result = store.select('df3', where = ['s in ["d"]'])
46444653
tm.assert_frame_equal(result, expected)

0 commit comments

Comments (0)