diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt
index 4790017727005..54ef3c893f355 100644
--- a/doc/source/whatsnew/v0.16.0.txt
+++ b/doc/source/whatsnew/v0.16.0.txt
@@ -514,6 +514,7 @@ Performance Improvements
 - Performance improvements in ``MultiIndex.sortlevel`` (:issue:`9445`)
 - Performance and memory usage improvements in ``DataFrame.duplicated`` (:issue:`9398`)
 - Cythonized ``Period`` (:issue:`9440`)
+- Decreased memory usage in ``to_hdf`` (:issue:`9648`)

 .. _whatsnew_0160.bug_fixes:

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 4029aaec78a4a..e05709d7a180f 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -2014,6 +2014,14 @@ def __setattr__(self, name, value):
     #----------------------------------------------------------------------
     # Consolidation of internals

+    def _protect_consolidate(self, f):
+        """ consolidate _data; if the number of blocks has changed, clear the item cache """
+        blocks_before = len(self._data.blocks)
+        result = f()
+        if len(self._data.blocks) != blocks_before:
+            self._clear_item_cache()
+        return result
+
     def _consolidate_inplace(self):
         f = lambda: self._data.consolidate()
         self._data = self._protect_consolidate(f)
@@ -2038,8 +2046,6 @@ def consolidate(self, inplace=False):
         else:
             f = lambda: self._data.consolidate()
             cons_data = self._protect_consolidate(f)
-            if cons_data is self._data:
-                cons_data = cons_data.copy()
             return self._constructor(cons_data).__finalize__(self)

     @property
@@ -2075,13 +2081,6 @@ def _check_inplace_setting(self, value):

         return True

-    def _protect_consolidate(self, f):
-        blocks_before = len(self._data.blocks)
-        result = f()
-        if len(self._data.blocks) != blocks_before:
-            self._clear_item_cache()
-        return result
-
     def _get_numeric_data(self):
         return self._constructor(
             self._data.get_numeric_data()).__finalize__(self)
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index 5cb032521d51a..7a16fb2b6b0d7 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -1752,7 +1752,7 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None,
         if self.is_categorical_astype(dtype):
             values = self.values
         else:
-            values = np.array(self.values).astype(dtype)
+            values = np.asarray(self.values).astype(dtype, copy=False)

         if copy:
             values = values.copy()
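The recurring ``np.array`` -> ``np.asarray`` and ``astype(..., copy=False)``
substitutions in this patch all exploit the same NumPy copy semantics:
``np.array`` copies its input by default, while ``np.asarray`` (and
``astype`` with ``copy=False``) return the input unchanged when it is
already an ndarray of the requested dtype. A minimal sketch of the
difference, assuming only NumPy (the variable names are illustrative)::

    import numpy as np

    values = np.arange(1e6)   # a float64 array, ~8MB

    copied = np.array(values)             # always allocates a new buffer
    aliased = np.asarray(values)          # same dtype: returns the input itself
    recast = values.astype(np.float64, copy=False)  # no-op cast: also no copy

    assert copied is not values
    assert aliased is values
    assert recast is values

For large blocks being serialized to HDF5, each avoided copy saves a
full-array allocation, which is where the memory reduction in this PR
comes from. The ``pytables.py`` hunks below apply this throughout the
write path.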
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index e784934ea28b2..be81993e1aab7 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -1782,13 +1782,13 @@ def set_atom(self, block, block_items, existing_col, min_itemsize,
             return self.set_atom_timedelta64(block)

         dtype = block.dtype.name
-        rvalues = block.values.ravel()
-        inferred_type = lib.infer_dtype(rvalues)
+        inferred_type = lib.infer_dtype(block.values)

         if inferred_type == 'date':
             raise TypeError(
                 "[date] is not implemented as a table column")
         elif inferred_type == 'datetime':
+            rvalues = block.values.ravel()
             if getattr(rvalues[0], 'tzinfo', None) is not None:

                 # if this block has more than one timezone, raise
@@ -1917,7 +1917,7 @@ def get_atom_data(self, block, kind=None):
     def set_atom_data(self, block):
         self.kind = block.dtype.name
         self.typ = self.get_atom_data(block)
-        self.set_data(block.values.astype(self.typ.type))
+        self.set_data(block.values.astype(self.typ.type, copy=False))

     def set_atom_categorical(self, block, items, info=None, values=None):
         # currently only supports a 1-D categorical
@@ -2016,7 +2016,7 @@ def convert(self, values, nan_rep, encoding):
                 index = DatetimeIndex(
                     self.data.ravel(), tz='UTC').tz_convert(self.tz)
-                self.data = np.array(
+                self.data = np.asarray(
                     index.tolist(), dtype=object).reshape(self.data.shape)

             else:
@@ -2026,14 +2026,14 @@ def convert(self, values, nan_rep, encoding):
                 self.data = np.asarray(self.data, dtype='m8[ns]')
             elif dtype == u('date'):
                 try:
-                    self.data = np.array(
+                    self.data = np.asarray(
                         [date.fromordinal(v) for v in self.data], dtype=object)
                 except ValueError:
-                    self.data = np.array(
+                    self.data = np.asarray(
                         [date.fromtimestamp(v) for v in self.data],
                         dtype=object)
             elif dtype == u('datetime'):
-                self.data = np.array(
+                self.data = np.asarray(
                     [datetime.fromtimestamp(v) for v in self.data],
                     dtype=object)
@@ -2048,9 +2048,9 @@ def convert(self, values, nan_rep, encoding):

             else:
                 try:
-                    self.data = self.data.astype(dtype)
+                    self.data = self.data.astype(dtype, copy=False)
                 except:
-                    self.data = self.data.astype('O')
+                    self.data = self.data.astype('O', copy=False)

         # convert nans / decode
         if _ensure_decoded(self.kind) == u('string'):
@@ -2337,9 +2337,9 @@ def read_array(self, key):
             ret = data

         if dtype == u('datetime64'):
-            ret = np.array(ret, dtype='M8[ns]')
+            ret = np.asarray(ret, dtype='M8[ns]')
         elif dtype == u('timedelta64'):
-            ret = np.array(ret, dtype='m8[ns]')
+            ret = np.asarray(ret, dtype='m8[ns]')

         if transposed:
             return ret.T
@@ -3793,7 +3793,7 @@ def write_data(self, chunksize, dropna=True):
                 # figure the mask: only do if we can successfully process this
                 # column, otherwise ignore the mask
                 mask = com.isnull(a.data).all(axis=0)
-                masks.append(mask.astype('u1'))
+                masks.append(mask.astype('u1', copy=False))

             # consolidate masks
             mask = masks[0]
@@ -3803,8 +3803,7 @@ def write_data(self, chunksize, dropna=True):

         else:
-            mask = np.empty(nrows, dtype='u1')
-            mask.fill(False)
+            mask = None

         # broadcast the indexes if needed
         indexes = [a.cvalues for a in self.index_axes]
@@ -3833,12 +3832,13 @@ def write_data(self, chunksize, dropna=True):
             bvalues = []
             for i, v in enumerate(values):
                 new_shape = (nrows,) + self.dtype[names[nindexes + i]].shape
-                bvalues.append(values[i].ravel().reshape(new_shape))
+                bvalues.append(values[i].reshape(new_shape))

         # write the chunks
         if chunksize is None:
             chunksize = 100000

+        rows = np.empty(min(chunksize, nrows), dtype=self.dtype)
         chunks = int(nrows / chunksize) + 1
         for i in range(chunks):
             start_i = i * chunksize
@@ -3847,11 +3847,20 @@ def write_data(self, chunksize, dropna=True):
                 break

             self.write_data_chunk(
+                rows,
                 indexes=[a[start_i:end_i] for a in bindexes],
-                mask=mask[start_i:end_i],
+                mask=mask[start_i:end_i] if mask is not None else None,
                 values=[v[start_i:end_i] for v in bvalues])

-    def write_data_chunk(self, indexes, mask, values):
+    def write_data_chunk(self, rows, indexes, mask, values):
+        """
+        Parameters
+        ----------
+        rows : a pre-allocated record array that the chunk is written into
+        indexes : a list of the index arrays
+        mask : the mask array, or None if no rows are to be dropped
+        values : a list of the value arrays
+        """

         # 0 len
        for v in values:
@@ -3860,7 +3869,8 @@ def write_data_chunk(self, indexes, mask, values):

         try:
             nrows = indexes[0].shape[0]
-            rows = np.empty(nrows, dtype=self.dtype)
+            if nrows != len(rows):
+                rows = np.empty(nrows, dtype=self.dtype)
             names = self.dtype.names
             nindexes = len(indexes)
@@ -3873,7 +3883,10 @@ def write_data_chunk(self, indexes, mask, values):
                 rows[names[i + nindexes]] = v

             # mask
-            rows = rows[~mask.ravel().astype(bool)]
+            if mask is not None:
+                m = ~mask.ravel().astype(bool, copy=False)
+                if not m.all():
+                    rows = rows[m]

         except Exception as detail:
             raise Exception("cannot create row-data -> %s" % detail)
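The ``rows`` buffer threaded through ``write_data`` above replaces a
per-chunk ``np.empty`` allocation: one record array of ``chunksize`` rows is
allocated up front and reused for every full chunk, with a fresh (smaller)
buffer created only for the final, partial chunk. Additionally, when
``dropna`` is False no mask array is built at all, so the row-filtering step
is skipped entirely. A rough sketch of the reuse pattern, independent of
PyTables (``write_chunked`` and ``write`` are hypothetical stand-ins, not
pandas API)::

    import numpy as np

    def write_chunked(data, write, chunksize=100000):
        """Write `data` in chunks, reusing one pre-allocated buffer."""
        nrows = len(data)
        # allocate the staging buffer once, rather than once per chunk
        rows = np.empty(min(chunksize, nrows), dtype=data.dtype)
        for start in range(0, nrows, chunksize):
            chunk = data[start:start + chunksize]
            if len(chunk) != len(rows):
                # only the final, partial chunk needs a fresh buffer
                rows = np.empty(len(chunk), dtype=data.dtype)
            rows[:] = chunk
            write(rows)

    # e.g. write_chunked(np.arange(250000), lambda r: None)

The remaining ``pytables.py`` hunks below apply the same ``np.asarray``
treatment to the index and string conversion helpers.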
@@ -4240,14 +4253,14 @@ def _convert_index(index, encoding=None, format_type=None):
                         tz=getattr(index, 'tz', None),
                         index_name=index_name)
     elif inferred_type == 'datetime':
-        converted = np.array([(time.mktime(v.timetuple()) +
-                               v.microsecond / 1E6) for v in values],
-                             dtype=np.float64)
+        converted = np.asarray([(time.mktime(v.timetuple()) +
+                                 v.microsecond / 1E6) for v in values],
+                               dtype=np.float64)
         return IndexCol(converted, 'datetime', _tables().Time64Col(),
                         index_name=index_name)
     elif inferred_type == 'date':
-        converted = np.array([v.toordinal() for v in values],
-                             dtype=np.int32)
+        converted = np.asarray([v.toordinal() for v in values],
+                               dtype=np.int32)
         return IndexCol(converted, 'date', _tables().Time32Col(),
                         index_name=index_name)
     elif inferred_type == 'string':
@@ -4290,21 +4303,21 @@ def _unconvert_index(data, kind, encoding=None):
     if kind == u('datetime64'):
         index = DatetimeIndex(data)
     elif kind == u('datetime'):
-        index = np.array([datetime.fromtimestamp(v) for v in data],
-                         dtype=object)
+        index = np.asarray([datetime.fromtimestamp(v) for v in data],
+                           dtype=object)
     elif kind == u('date'):
         try:
-            index = np.array(
+            index = np.asarray(
                 [date.fromordinal(v) for v in data], dtype=object)
         except (ValueError):
-            index = np.array(
+            index = np.asarray(
                 [date.fromtimestamp(v) for v in data], dtype=object)
     elif kind in (u('integer'), u('float')):
-        index = np.array(data)
+        index = np.asarray(data)
     elif kind in (u('string')):
         index = _unconvert_string_array(data, nan_rep=None, encoding=encoding)
     elif kind == u('object'):
-        index = np.array(data[0])
+        index = np.asarray(data[0])
     else:  # pragma: no cover
         raise ValueError('unrecognized index type %s' % kind)
     return index
@@ -4315,7 +4328,7 @@ def _unconvert_index_legacy(data, kind, legacy=False, encoding=None):
     if kind == u('datetime'):
         index = lib.time64_to_datetime(data)
     elif kind in (u('integer')):
-        index = np.array(data, dtype=object)
+        index = np.asarray(data, dtype=object)
     elif kind in (u('string')):
         index = _unconvert_string_array(data, nan_rep=None, encoding=encoding)
     else:  # pragma: no cover
@@ -4334,13 +4347,13 @@ def _convert_string_array(data, encoding, itemsize=None):
     if itemsize is None:
         itemsize = lib.max_len_string_array(com._ensure_object(data.ravel()))

-    data = np.array(data, dtype="S%d" % itemsize)
+    data = np.asarray(data, dtype="S%d" % itemsize)
     return data

 def _unconvert_string_array(data, nan_rep=None, encoding=None):
     """ deserialize a string array, possibly decoding """
     shape = data.shape
-    data = np.array(data.ravel(), dtype=object)
+    data = np.asarray(data.ravel(), dtype=object)

     # guard against a None encoding in PY3 (because of a legacy
     # where the passed encoding is actually None)
@@ -4353,7 +4366,7 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None):
                 dtype = "U{0}".format(itemsize)
             else:
                 dtype = "S{0}".format(itemsize)
-            data = data.astype(dtype).astype(object)
+            data = data.astype(dtype, copy=False).astype(object, copy=False)
         except (Exception) as e:
             f = np.vectorize(lambda x: x.decode(encoding), otypes=[np.object])
             data = f(data)
@@ -4376,7 +4389,7 @@ def _maybe_convert(values, val_kind, encoding):
 def _get_converter(kind, encoding):
     kind = _ensure_decoded(kind)
     if kind == 'datetime64':
-        return lambda x: np.array(x, dtype='M8[ns]')
+        return lambda x: np.asarray(x, dtype='M8[ns]')
     elif kind == 'datetime':
         return lib.convert_timestamps
     elif kind == 'string':
@@ -4421,7 +4434,7 @@ def __init__(self, table, where=None, start=None, stop=None, **kwargs):
             try:
                 inferred = lib.infer_dtype(where)
                 if inferred == 'integer' or inferred == 'boolean':
-                    where = np.array(where)
+                    where = np.asarray(where)
                     if where.dtype == np.bool_:
                         start, stop = self.start, self.stop
                         if start is None:
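One behavioural consequence of the ``np.asarray`` conversions in
``_convert_index`` and ``_convert_string_array`` is that the returned array
may alias the caller's data rather than own a fresh copy. That is safe here
because the results are only serialized, never mutated in place. A small
illustration of the aliasing behaviour being relied on (variable names are
illustrative only)::

    import numpy as np

    original = np.array([1, 2, 3], dtype=np.int64)

    view = np.asarray(original)   # no copy: `view` aliases `original`
    view[0] = 99
    assert original[0] == 99      # the mutation is visible through both names

    fresh = np.array(original)    # np.array still makes an independent copy
    fresh[0] = 0
    assert original[0] == 99      # `original` is unaffected

The test changes below round out the patch with coverage for categorical
round-trips and selection.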
diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py
index e95d46f66f17f..41287ebba1ba3 100644
--- a/pandas/io/tests/test_pytables.py
+++ b/pandas/io/tests/test_pytables.py
@@ -4593,12 +4593,17 @@ def test_categorical(self):

         with ensure_clean_store(self.path) as store:

-            s = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c'], categories=['a','b','c','d']))
-
+            # basic
+            s = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c'], categories=['a','b','c','d'], ordered=False))
             store.append('s', s, format='table')
             result = store.select('s')
             tm.assert_series_equal(s, result)

+            s = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c'], categories=['a','b','c','d'], ordered=True))
+            store.append('s_ordered', s, format='table')
+            result = store.select('s_ordered')
+            tm.assert_series_equal(s, result)
+
             df = DataFrame({"s":s, "vals":[1,2,3,4,5,6]})
             store.append('df', df, format='table')
             result = store.select('df')
@@ -4639,6 +4644,10 @@ def test_categorical(self):
             result = store.select('df3', where = ['s in ["b","c"]'])
             tm.assert_frame_equal(result, expected)

+            expected = df[df.s.isin(['b','c'])]
+            result = store.select('df3', where = ['s = ["b","c"]'])
+            tm.assert_frame_equal(result, expected)
+
             expected = df[df.s.isin(['d'])]
             result = store.select('df3', where = ['s in ["d"]'])
             tm.assert_frame_equal(result, expected)
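The new test cases pin down two behaviours: ordered categoricals should
round-trip through ``HDFStore`` with their ordering intact, and ``=`` with a
list on the right-hand side should behave like ``in`` when selecting on a
categorical data column. A condensed usage sketch of what the tests
exercise (the file name is illustrative, and the ordered round-trip and
``=``-selection are combined into one snippet here)::

    import pandas as pd
    from pandas import Categorical, DataFrame, Series

    s = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c'],
                           categories=['a', 'b', 'c', 'd'], ordered=True))
    df = DataFrame({'s': s, 'vals': [1, 2, 3, 4, 5, 6]})

    with pd.HDFStore('cats.h5', mode='w') as store:
        # `s` must be a data column to be usable in a where clause
        store.append('df', df, format='table', data_columns=['s'])
        # `s = [...]` behaves like `s in [...]` for list-valued comparisons
        result = store.select('df', where='s = ["b", "c"]')

    assert list(result['vals']) == [2, 3, 6]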