PERF: optimize memory usage for to_hdf #9648

Merged (1 commit) on Mar 16, 2015
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.16.0.txt
@@ -514,6 +514,7 @@ Performance Improvements
- Performance improvements in ``MultiIndex.sortlevel`` (:issue:`9445`)
- Performance and memory usage improvements in ``DataFrame.duplicated`` (:issue:`9398`)
- Cythonized ``Period`` (:issue:`9440`)
- Decreased memory usage on ``to_hdf`` (:issue:`9648`)

.. _whatsnew_0160.bug_fixes:

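The whatsnew entry summarizes the user-visible effect: lower peak memory when writing with `to_hdf`. A minimal repro sketch for observing the difference (the array sizes and the profiling tool are assumptions, not part of the PR; PyTables must be installed):

```python
import numpy as np
import pandas as pd

# Hypothetical sizes -- large enough that intermediate copies dominate peak RSS.
df = pd.DataFrame(np.random.randn(1000000, 10))

# format='table' goes through write_data/write_data_chunk, the code path this
# PR optimizes; peak memory before/after can be compared with a profiler such
# as memory_profiler's `mprof run`.
df.to_hdf('store.h5', 'df', format='table')
```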
17 changes: 8 additions & 9 deletions pandas/core/generic.py
@@ -2014,6 +2014,14 @@ def __setattr__(self, name, value):
#----------------------------------------------------------------------
# Consolidation of internals

def _protect_consolidate(self, f):
""" consolidate _data. if the blocks have changed, then clear the cache """
blocks_before = len(self._data.blocks)
result = f()
if len(self._data.blocks) != blocks_before:
self._clear_item_cache()
return result

def _consolidate_inplace(self):
f = lambda: self._data.consolidate()
self._data = self._protect_consolidate(f)
@@ -2038,8 +2046,6 @@ def consolidate(self, inplace=False):
else:
f = lambda: self._data.consolidate()
cons_data = self._protect_consolidate(f)
if cons_data is self._data:
cons_data = cons_data.copy()
return self._constructor(cons_data).__finalize__(self)

@property
@@ -2075,13 +2081,6 @@ def _check_inplace_setting(self, value):

return True

def _protect_consolidate(self, f):
blocks_before = len(self._data.blocks)
result = f()
if len(self._data.blocks) != blocks_before:
self._clear_item_cache()
return result

def _get_numeric_data(self):
return self._constructor(
self._data.get_numeric_data()).__finalize__(self)
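`_protect_consolidate` is only relocated here (it previously lived further down in the class, without a docstring), but dropping the `cons_data.copy()` branch means `consolidate()` no longer duplicates data that is already consolidated. A sketch of what consolidation does to the block count, using pandas-0.16-era internals (`._data` and block counts are private implementation details, shown purely for illustration):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({'a': np.arange(3)})
df['b'] = np.arange(3.0)
df['c'] = np.arange(3)          # second int64 block: frame is now unconsolidated

print(len(df._data.blocks))     # 3 blocks (int64, float64, int64)
df2 = df.consolidate()          # merges same-dtype blocks
print(len(df2._data.blocks))    # 2 blocks (int64, float64)
```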
2 changes: 1 addition & 1 deletion pandas/core/internals.py
@@ -1752,7 +1752,7 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None,
if self.is_categorical_astype(dtype):
values = self.values
else:
values = np.array(self.values).astype(dtype)
values = np.asarray(self.values).astype(dtype, copy=False)

if copy:
values = values.copy()
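This one-line change is the pattern that recurs throughout the PR: `np.array` always copies and a bare `astype` copies again, while `np.asarray` plus `astype(..., copy=False)` only allocates when a real conversion is needed. A small sketch of the difference:

```python
import numpy as np

values = np.arange(5, dtype='int64')

# Old pattern: np.array copies unconditionally, then astype copies again
# even though the dtype already matches -- two allocations for a no-op.
a = np.array(values).astype('int64')

# New pattern: asarray passes the input through untouched, and
# astype(..., copy=False) skips the copy when no conversion is needed.
b = np.asarray(values).astype('int64', copy=False)

print(b is values)  # True: no copy was made at all
print(a is values)  # False: two copies were made
```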
85 changes: 49 additions & 36 deletions pandas/io/pytables.py
@@ -1782,13 +1782,13 @@ def set_atom(self, block, block_items, existing_col, min_itemsize,
return self.set_atom_timedelta64(block)

dtype = block.dtype.name
rvalues = block.values.ravel()
inferred_type = lib.infer_dtype(rvalues)
inferred_type = lib.infer_dtype(block.values)

if inferred_type == 'date':
raise TypeError(
"[date] is not implemented as a table column")
elif inferred_type == 'datetime':
rvalues = block.values.ravel()
if getattr(rvalues[0], 'tzinfo', None) is not None:

# if this block has more than one timezone, raise
@@ -1917,7 +1917,7 @@ def get_atom_data(self, block, kind=None):
def set_atom_data(self, block):
self.kind = block.dtype.name
self.typ = self.get_atom_data(block)
self.set_data(block.values.astype(self.typ.type))
self.set_data(block.values.astype(self.typ.type, copy=False))

def set_atom_categorical(self, block, items, info=None, values=None):
# currently only supports a 1-D categorical
@@ -2016,7 +2016,7 @@ def convert(self, values, nan_rep, encoding):

index = DatetimeIndex(
self.data.ravel(), tz='UTC').tz_convert(self.tz)
self.data = np.array(
self.data = np.asarray(
index.tolist(), dtype=object).reshape(self.data.shape)

else:
Expand All @@ -2026,14 +2026,14 @@ def convert(self, values, nan_rep, encoding):
self.data = np.asarray(self.data, dtype='m8[ns]')
elif dtype == u('date'):
try:
self.data = np.array(
self.data = np.asarray(
[date.fromordinal(v) for v in self.data], dtype=object)
except ValueError:
self.data = np.array(
self.data = np.asarray(
[date.fromtimestamp(v) for v in self.data],
dtype=object)
elif dtype == u('datetime'):
self.data = np.array(
self.data = np.asarray(
[datetime.fromtimestamp(v) for v in self.data],
dtype=object)

@@ -2048,9 +2048,9 @@ def convert(self, values, nan_rep, encoding):
else:

try:
self.data = self.data.astype(dtype)
self.data = self.data.astype(dtype, copy=False)
except:
self.data = self.data.astype('O')
self.data = self.data.astype('O', copy=False)

# convert nans / decode
if _ensure_decoded(self.kind) == u('string'):
@@ -2337,9 +2337,9 @@ def read_array(self, key):
ret = data

if dtype == u('datetime64'):
ret = np.array(ret, dtype='M8[ns]')
ret = np.asarray(ret, dtype='M8[ns]')
elif dtype == u('timedelta64'):
ret = np.array(ret, dtype='m8[ns]')
ret = np.asarray(ret, dtype='m8[ns]')

if transposed:
return ret.T
@@ -3793,7 +3793,7 @@ def write_data(self, chunksize, dropna=True):
# figure the mask: only do if we can successfully process this
# column, otherwise ignore the mask
mask = com.isnull(a.data).all(axis=0)
masks.append(mask.astype('u1'))
masks.append(mask.astype('u1', copy=False))

# consolidate masks
mask = masks[0]
@@ -3803,8 +3803,7 @@

else:

mask = np.empty(nrows, dtype='u1')
mask.fill(False)
mask = None

# broadcast the indexes if needed
indexes = [a.cvalues for a in self.index_axes]
@@ -3833,12 +3832,13 @@ def write_data(self, chunksize, dropna=True):
bvalues = []
for i, v in enumerate(values):
new_shape = (nrows,) + self.dtype[names[nindexes + i]].shape
bvalues.append(values[i].ravel().reshape(new_shape))
bvalues.append(values[i].reshape(new_shape))

# write the chunks
if chunksize is None:
chunksize = 100000

rows = np.empty(min(chunksize,nrows), dtype=self.dtype)
chunks = int(nrows / chunksize) + 1
for i in range(chunks):
start_i = i * chunksize
@@ -3847,11 +3847,20 @@
break

self.write_data_chunk(
rows,
indexes=[a[start_i:end_i] for a in bindexes],
mask=mask[start_i:end_i],
mask=mask[start_i:end_i] if mask is not None else None,
values=[v[start_i:end_i] for v in bvalues])

def write_data_chunk(self, indexes, mask, values):
def write_data_chunk(self, rows, indexes, mask, values):
"""
Parameters
----------
rows : an empty memory space where we are putting the chunk
indexes : an array of the indexes
mask : an array of the masks
values : an array of the values
"""

# 0 len
for v in values:
Expand All @@ -3860,7 +3869,8 @@ def write_data_chunk(self, indexes, mask, values):

try:
nrows = indexes[0].shape[0]
rows = np.empty(nrows, dtype=self.dtype)
if nrows != len(rows):
rows = np.empty(nrows, dtype=self.dtype)
names = self.dtype.names
nindexes = len(indexes)

@@ -3873,7 +3883,10 @@ def write_data_chunk(self, indexes, mask, values):
rows[names[i + nindexes]] = v

# mask
rows = rows[~mask.ravel().astype(bool)]
if mask is not None:
m = ~mask.ravel().astype(bool, copy=False)
if not m.all():
rows = rows[m]

except Exception as detail:
raise Exception("cannot create row-data -> %s" % detail)
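Together with the `mask = None` change above, masking is now pay-as-you-go: with `dropna=False` no per-row mask array is ever built, and even with a mask the fancy-indexing copy `rows[m]` is skipped when nothing is actually dropped. The combined logic, distilled (helper name is hypothetical):

```python
import numpy as np

def apply_mask(rows, mask):
    # mask is None when dropna=False: no mask array was ever allocated.
    if mask is None:
        return rows
    m = ~mask.ravel().astype(bool, copy=False)
    # rows[m] fancy-indexes and therefore copies; skip it when no row is dropped.
    return rows if m.all() else rows[m]
```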
@@ -4240,14 +4253,14 @@ def _convert_index(index, encoding=None, format_type=None):
tz=getattr(index, 'tz', None),
index_name=index_name)
elif inferred_type == 'datetime':
converted = np.array([(time.mktime(v.timetuple()) +
v.microsecond / 1E6) for v in values],
dtype=np.float64)
converted = np.asarray([(time.mktime(v.timetuple()) +
v.microsecond / 1E6) for v in values],
dtype=np.float64)
return IndexCol(converted, 'datetime', _tables().Time64Col(),
index_name=index_name)
elif inferred_type == 'date':
converted = np.array([v.toordinal() for v in values],
dtype=np.int32)
converted = np.asarray([v.toordinal() for v in values],
dtype=np.int32)
return IndexCol(converted, 'date', _tables().Time32Col(),
index_name=index_name)
elif inferred_type == 'string':
@@ -4290,21 +4303,21 @@ def _unconvert_index(data, kind, encoding=None):
if kind == u('datetime64'):
index = DatetimeIndex(data)
elif kind == u('datetime'):
index = np.array([datetime.fromtimestamp(v) for v in data],
dtype=object)
index = np.asarray([datetime.fromtimestamp(v) for v in data],
dtype=object)
elif kind == u('date'):
try:
index = np.array(
index = np.asarray(
[date.fromordinal(v) for v in data], dtype=object)
except (ValueError):
index = np.array(
index = np.asarray(
[date.fromtimestamp(v) for v in data], dtype=object)
elif kind in (u('integer'), u('float')):
index = np.array(data)
index = np.asarray(data)
elif kind in (u('string')):
index = _unconvert_string_array(data, nan_rep=None, encoding=encoding)
elif kind == u('object'):
index = np.array(data[0])
index = np.asarray(data[0])
else: # pragma: no cover
raise ValueError('unrecognized index type %s' % kind)
return index
@@ -4315,7 +4328,7 @@ def _unconvert_index_legacy(data, kind, legacy=False, encoding=None):
if kind == u('datetime'):
index = lib.time64_to_datetime(data)
elif kind in (u('integer')):
index = np.array(data, dtype=object)
index = np.asarray(data, dtype=object)
elif kind in (u('string')):
index = _unconvert_string_array(data, nan_rep=None, encoding=encoding)
else: # pragma: no cover
@@ -4334,13 +4347,13 @@ def _convert_string_array(data, encoding, itemsize=None):
if itemsize is None:
itemsize = lib.max_len_string_array(com._ensure_object(data.ravel()))

data = np.array(data, dtype="S%d" % itemsize)
data = np.asarray(data, dtype="S%d" % itemsize)
return data

def _unconvert_string_array(data, nan_rep=None, encoding=None):
""" deserialize a string array, possibly decoding """
shape = data.shape
data = np.array(data.ravel(), dtype=object)
data = np.asarray(data.ravel(), dtype=object)

# guard against a None encoding in PY3 (because of a legacy
# where the passed encoding is actually None)
Expand All @@ -4353,7 +4366,7 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None):
dtype = "U{0}".format(itemsize)
else:
dtype = "S{0}".format(itemsize)
data = data.astype(dtype).astype(object)
data = data.astype(dtype, copy=False).astype(object, copy=False)
except (Exception) as e:
f = np.vectorize(lambda x: x.decode(encoding), otypes=[np.object])
data = f(data)
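`_unconvert_string_array` decodes fixed-width bytes back to Python string objects; with `copy=False` on both `astype` calls, NumPy skips any step where the source dtype already matches the target. A sketch of the decode path (ASCII data assumed; the real function starts from an object array rather than `S3`):

```python
import numpy as np

raw = np.asarray([b'foo', b'ba'], dtype='S3')

# S3 -> U3 genuinely converts (and allocates), but copy=False lets either
# step become a no-op whenever the dtype already matches.
decoded = raw.astype('U3', copy=False).astype(object, copy=False)
print(decoded)  # ['foo' 'ba'] as Python str objects
```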
@@ -4376,7 +4389,7 @@ def _maybe_convert(values, val_kind, encoding):
def _get_converter(kind, encoding):
kind = _ensure_decoded(kind)
if kind == 'datetime64':
return lambda x: np.array(x, dtype='M8[ns]')
return lambda x: np.asarray(x, dtype='M8[ns]')
elif kind == 'datetime':
return lib.convert_timestamps
elif kind == 'string':
@@ -4421,7 +4434,7 @@ def __init__(self, table, where=None, start=None, stop=None, **kwargs):
try:
inferred = lib.infer_dtype(where)
if inferred == 'integer' or inferred == 'boolean':
where = np.array(where)
where = np.asarray(where)
if where.dtype == np.bool_:
start, stop = self.start, self.stop
if start is None:
13 changes: 11 additions & 2 deletions pandas/io/tests/test_pytables.py
@@ -4593,12 +4593,17 @@ def test_categorical(self):

with ensure_clean_store(self.path) as store:

s = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c'], categories=['a','b','c','d']))

# basic
s = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c'], categories=['a','b','c','d'], ordered=False))
store.append('s', s, format='table')
result = store.select('s')
tm.assert_series_equal(s, result)

s = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c'], categories=['a','b','c','d'], ordered=True))
store.append('s_ordered', s, format='table')
result = store.select('s_ordered')
tm.assert_series_equal(s, result)

df = DataFrame({"s":s, "vals":[1,2,3,4,5,6]})
store.append('df', df, format='table')
result = store.select('df')
Expand Down Expand Up @@ -4639,6 +4644,10 @@ def test_categorical(self):
result = store.select('df3', where = ['s in ["b","c"]'])
tm.assert_frame_equal(result, expected)

expected = df[df.s.isin(['b','c'])]
result = store.select('df3', where = ['s = ["b","c"]'])
tm.assert_frame_equal(result, expected)

expected = df[df.s.isin(['d'])]
result = store.select('df3', where = ['s in ["d"]'])
tm.assert_frame_equal(result, expected)
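The new test coverage pins down that `ordered=True` categoricals round-trip through the table format, and that `s = ["b","c"]` is accepted as a spelling of `s in ["b","c"]` in `where` clauses. A condensed, self-contained version of the ordered round-trip (file name is arbitrary):

```python
import pandas as pd
from pandas import Categorical, Series

s = Series(Categorical(list('abbaac'), categories=list('abcd'), ordered=True))

with pd.HDFStore('cats.h5') as store:
    store.append('s_ordered', s, format='table')
    result = store.select('s_ordered')

assert result.cat.ordered       # orderedness survives the round trip
pd.util.testing.assert_series_equal(s, result)
```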