Skip to content

Commit 4f67501

Browse files
committed
PERF: optimize memory usage for to_hdf
1 parent 7bce39b commit 4f67501

File tree

5 files changed

+70
-48
lines changed

5 files changed

+70
-48
lines changed

doc/source/whatsnew/v0.16.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -514,6 +514,7 @@ Performance Improvements
514514
- Performance improvements in ``MultiIndex.sortlevel`` (:issue:`9445`)
515515
- Performance and memory usage improvements in ``DataFrame.duplicated`` (:issue:`9398`)
516516
- Cythonized ``Period`` (:issue:`9440`)
517+
- Decreased memory usage on ``to_hdf`` (:issue:`9648`)
517518

518519
.. _whatsnew_0160.bug_fixes:
519520

pandas/core/generic.py

+8-9
Original file line numberDiff line numberDiff line change
@@ -2014,6 +2014,14 @@ def __setattr__(self, name, value):
20142014
#----------------------------------------------------------------------
20152015
# Consolidation of internals
20162016

2017+
def _protect_consolidate(self, f):
2018+
""" consolidate _data. if the blocks have changed, then clear the cache """
2019+
blocks_before = len(self._data.blocks)
2020+
result = f()
2021+
if len(self._data.blocks) != blocks_before:
2022+
self._clear_item_cache()
2023+
return result
2024+
20172025
def _consolidate_inplace(self):
20182026
f = lambda: self._data.consolidate()
20192027
self._data = self._protect_consolidate(f)
@@ -2038,8 +2046,6 @@ def consolidate(self, inplace=False):
20382046
else:
20392047
f = lambda: self._data.consolidate()
20402048
cons_data = self._protect_consolidate(f)
2041-
if cons_data is self._data:
2042-
cons_data = cons_data.copy()
20432049
return self._constructor(cons_data).__finalize__(self)
20442050

20452051
@property
@@ -2075,13 +2081,6 @@ def _check_inplace_setting(self, value):
20752081

20762082
return True
20772083

2078-
def _protect_consolidate(self, f):
2079-
blocks_before = len(self._data.blocks)
2080-
result = f()
2081-
if len(self._data.blocks) != blocks_before:
2082-
self._clear_item_cache()
2083-
return result
2084-
20852084
def _get_numeric_data(self):
20862085
return self._constructor(
20872086
self._data.get_numeric_data()).__finalize__(self)

pandas/core/internals.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1752,7 +1752,7 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None,
17521752
if self.is_categorical_astype(dtype):
17531753
values = self.values
17541754
else:
1755-
values = np.array(self.values).astype(dtype)
1755+
values = np.asarray(self.values).astype(dtype, copy=False)
17561756

17571757
if copy:
17581758
values = values.copy()

pandas/io/pytables.py

+49-36
Original file line numberDiff line numberDiff line change
@@ -1782,13 +1782,13 @@ def set_atom(self, block, block_items, existing_col, min_itemsize,
17821782
return self.set_atom_timedelta64(block)
17831783

17841784
dtype = block.dtype.name
1785-
rvalues = block.values.ravel()
1786-
inferred_type = lib.infer_dtype(rvalues)
1785+
inferred_type = lib.infer_dtype(block.values)
17871786

17881787
if inferred_type == 'date':
17891788
raise TypeError(
17901789
"[date] is not implemented as a table column")
17911790
elif inferred_type == 'datetime':
1791+
rvalues = block.values.ravel()
17921792
if getattr(rvalues[0], 'tzinfo', None) is not None:
17931793

17941794
# if this block has more than one timezone, raise
@@ -1917,7 +1917,7 @@ def get_atom_data(self, block, kind=None):
19171917
def set_atom_data(self, block):
19181918
self.kind = block.dtype.name
19191919
self.typ = self.get_atom_data(block)
1920-
self.set_data(block.values.astype(self.typ.type))
1920+
self.set_data(block.values.astype(self.typ.type, copy=False))
19211921

19221922
def set_atom_categorical(self, block, items, info=None, values=None):
19231923
# currently only supports a 1-D categorical
@@ -2016,7 +2016,7 @@ def convert(self, values, nan_rep, encoding):
20162016

20172017
index = DatetimeIndex(
20182018
self.data.ravel(), tz='UTC').tz_convert(self.tz)
2019-
self.data = np.array(
2019+
self.data = np.asarray(
20202020
index.tolist(), dtype=object).reshape(self.data.shape)
20212021

20222022
else:
@@ -2026,14 +2026,14 @@ def convert(self, values, nan_rep, encoding):
20262026
self.data = np.asarray(self.data, dtype='m8[ns]')
20272027
elif dtype == u('date'):
20282028
try:
2029-
self.data = np.array(
2029+
self.data = np.asarray(
20302030
[date.fromordinal(v) for v in self.data], dtype=object)
20312031
except ValueError:
2032-
self.data = np.array(
2032+
self.data = np.asarray(
20332033
[date.fromtimestamp(v) for v in self.data],
20342034
dtype=object)
20352035
elif dtype == u('datetime'):
2036-
self.data = np.array(
2036+
self.data = np.asarray(
20372037
[datetime.fromtimestamp(v) for v in self.data],
20382038
dtype=object)
20392039

@@ -2048,9 +2048,9 @@ def convert(self, values, nan_rep, encoding):
20482048
else:
20492049

20502050
try:
2051-
self.data = self.data.astype(dtype)
2051+
self.data = self.data.astype(dtype, copy=False)
20522052
except:
2053-
self.data = self.data.astype('O')
2053+
self.data = self.data.astype('O', copy=False)
20542054

20552055
# convert nans / decode
20562056
if _ensure_decoded(self.kind) == u('string'):
@@ -2337,9 +2337,9 @@ def read_array(self, key):
23372337
ret = data
23382338

23392339
if dtype == u('datetime64'):
2340-
ret = np.array(ret, dtype='M8[ns]')
2340+
ret = np.asarray(ret, dtype='M8[ns]')
23412341
elif dtype == u('timedelta64'):
2342-
ret = np.array(ret, dtype='m8[ns]')
2342+
ret = np.asarray(ret, dtype='m8[ns]')
23432343

23442344
if transposed:
23452345
return ret.T
@@ -3793,7 +3793,7 @@ def write_data(self, chunksize, dropna=True):
37933793
# figure the mask: only do if we can successfully process this
37943794
# column, otherwise ignore the mask
37953795
mask = com.isnull(a.data).all(axis=0)
3796-
masks.append(mask.astype('u1'))
3796+
masks.append(mask.astype('u1', copy=False))
37973797

37983798
# consolidate masks
37993799
mask = masks[0]
@@ -3803,8 +3803,7 @@ def write_data(self, chunksize, dropna=True):
38033803

38043804
else:
38053805

3806-
mask = np.empty(nrows, dtype='u1')
3807-
mask.fill(False)
3806+
mask = None
38083807

38093808
# broadcast the indexes if needed
38103809
indexes = [a.cvalues for a in self.index_axes]
@@ -3833,12 +3832,13 @@ def write_data(self, chunksize, dropna=True):
38333832
bvalues = []
38343833
for i, v in enumerate(values):
38353834
new_shape = (nrows,) + self.dtype[names[nindexes + i]].shape
3836-
bvalues.append(values[i].ravel().reshape(new_shape))
3835+
bvalues.append(values[i].reshape(new_shape))
38373836

38383837
# write the chunks
38393838
if chunksize is None:
38403839
chunksize = 100000
38413840

3841+
rows = np.empty(min(chunksize,nrows), dtype=self.dtype)
38423842
chunks = int(nrows / chunksize) + 1
38433843
for i in range(chunks):
38443844
start_i = i * chunksize
@@ -3847,11 +3847,20 @@ def write_data(self, chunksize, dropna=True):
38473847
break
38483848

38493849
self.write_data_chunk(
3850+
rows,
38503851
indexes=[a[start_i:end_i] for a in bindexes],
3851-
mask=mask[start_i:end_i],
3852+
mask=mask[start_i:end_i] if mask is not None else None,
38523853
values=[v[start_i:end_i] for v in bvalues])
38533854

3854-
def write_data_chunk(self, indexes, mask, values):
3855+
def write_data_chunk(self, rows, indexes, mask, values):
3856+
"""
3857+
Parameters
3858+
----------
3859+
rows : an empty memory space where we are putting the chunk
3860+
indexes : an array of the indexes
3861+
mask : an array of the masks
3862+
values : an array of the values
3863+
"""
38553864

38563865
# 0 len
38573866
for v in values:
@@ -3860,7 +3869,8 @@ def write_data_chunk(self, indexes, mask, values):
38603869

38613870
try:
38623871
nrows = indexes[0].shape[0]
3863-
rows = np.empty(nrows, dtype=self.dtype)
3872+
if nrows != len(rows):
3873+
rows = np.empty(nrows, dtype=self.dtype)
38643874
names = self.dtype.names
38653875
nindexes = len(indexes)
38663876

@@ -3873,7 +3883,10 @@ def write_data_chunk(self, indexes, mask, values):
38733883
rows[names[i + nindexes]] = v
38743884

38753885
# mask
3876-
rows = rows[~mask.ravel().astype(bool)]
3886+
if mask is not None:
3887+
m = ~mask.ravel().astype(bool, copy=False)
3888+
if not m.all():
3889+
rows = rows[m]
38773890

38783891
except Exception as detail:
38793892
raise Exception("cannot create row-data -> %s" % detail)
@@ -4240,14 +4253,14 @@ def _convert_index(index, encoding=None, format_type=None):
42404253
tz=getattr(index, 'tz', None),
42414254
index_name=index_name)
42424255
elif inferred_type == 'datetime':
4243-
converted = np.array([(time.mktime(v.timetuple()) +
4244-
v.microsecond / 1E6) for v in values],
4245-
dtype=np.float64)
4256+
converted = np.asarray([(time.mktime(v.timetuple()) +
4257+
v.microsecond / 1E6) for v in values],
4258+
dtype=np.float64)
42464259
return IndexCol(converted, 'datetime', _tables().Time64Col(),
42474260
index_name=index_name)
42484261
elif inferred_type == 'date':
4249-
converted = np.array([v.toordinal() for v in values],
4250-
dtype=np.int32)
4262+
converted = np.asarray([v.toordinal() for v in values],
4263+
dtype=np.int32)
42514264
return IndexCol(converted, 'date', _tables().Time32Col(),
42524265
index_name=index_name)
42534266
elif inferred_type == 'string':
@@ -4290,21 +4303,21 @@ def _unconvert_index(data, kind, encoding=None):
42904303
if kind == u('datetime64'):
42914304
index = DatetimeIndex(data)
42924305
elif kind == u('datetime'):
4293-
index = np.array([datetime.fromtimestamp(v) for v in data],
4294-
dtype=object)
4306+
index = np.asarray([datetime.fromtimestamp(v) for v in data],
4307+
dtype=object)
42954308
elif kind == u('date'):
42964309
try:
4297-
index = np.array(
4310+
index = np.asarray(
42984311
[date.fromordinal(v) for v in data], dtype=object)
42994312
except (ValueError):
4300-
index = np.array(
4313+
index = np.asarray(
43014314
[date.fromtimestamp(v) for v in data], dtype=object)
43024315
elif kind in (u('integer'), u('float')):
4303-
index = np.array(data)
4316+
index = np.asarray(data)
43044317
elif kind in (u('string')):
43054318
index = _unconvert_string_array(data, nan_rep=None, encoding=encoding)
43064319
elif kind == u('object'):
4307-
index = np.array(data[0])
4320+
index = np.asarray(data[0])
43084321
else: # pragma: no cover
43094322
raise ValueError('unrecognized index type %s' % kind)
43104323
return index
@@ -4315,7 +4328,7 @@ def _unconvert_index_legacy(data, kind, legacy=False, encoding=None):
43154328
if kind == u('datetime'):
43164329
index = lib.time64_to_datetime(data)
43174330
elif kind in (u('integer')):
4318-
index = np.array(data, dtype=object)
4331+
index = np.asarray(data, dtype=object)
43194332
elif kind in (u('string')):
43204333
index = _unconvert_string_array(data, nan_rep=None, encoding=encoding)
43214334
else: # pragma: no cover
@@ -4334,13 +4347,13 @@ def _convert_string_array(data, encoding, itemsize=None):
43344347
if itemsize is None:
43354348
itemsize = lib.max_len_string_array(com._ensure_object(data.ravel()))
43364349

4337-
data = np.array(data, dtype="S%d" % itemsize)
4350+
data = np.asarray(data, dtype="S%d" % itemsize)
43384351
return data
43394352

43404353
def _unconvert_string_array(data, nan_rep=None, encoding=None):
43414354
""" deserialize a string array, possibly decoding """
43424355
shape = data.shape
4343-
data = np.array(data.ravel(), dtype=object)
4356+
data = np.asarray(data.ravel(), dtype=object)
43444357

43454358
# guard against a None encoding in PY3 (because of a legacy
43464359
# where the passed encoding is actually None)
@@ -4353,7 +4366,7 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None):
43534366
dtype = "U{0}".format(itemsize)
43544367
else:
43554368
dtype = "S{0}".format(itemsize)
4356-
data = data.astype(dtype).astype(object)
4369+
data = data.astype(dtype, copy=False).astype(object, copy=False)
43574370
except (Exception) as e:
43584371
f = np.vectorize(lambda x: x.decode(encoding), otypes=[np.object])
43594372
data = f(data)
@@ -4376,7 +4389,7 @@ def _maybe_convert(values, val_kind, encoding):
43764389
def _get_converter(kind, encoding):
43774390
kind = _ensure_decoded(kind)
43784391
if kind == 'datetime64':
4379-
return lambda x: np.array(x, dtype='M8[ns]')
4392+
return lambda x: np.asarray(x, dtype='M8[ns]')
43804393
elif kind == 'datetime':
43814394
return lib.convert_timestamps
43824395
elif kind == 'string':
@@ -4421,7 +4434,7 @@ def __init__(self, table, where=None, start=None, stop=None, **kwargs):
44214434
try:
44224435
inferred = lib.infer_dtype(where)
44234436
if inferred == 'integer' or inferred == 'boolean':
4424-
where = np.array(where)
4437+
where = np.asarray(where)
44254438
if where.dtype == np.bool_:
44264439
start, stop = self.start, self.stop
44274440
if start is None:

pandas/io/tests/test_pytables.py

+11-2
Original file line numberDiff line numberDiff line change
@@ -4593,12 +4593,17 @@ def test_categorical(self):
45934593

45944594
with ensure_clean_store(self.path) as store:
45954595

4596-
s = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c'], categories=['a','b','c','d']))
4597-
4596+
# basic
4597+
s = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c'], categories=['a','b','c','d'], ordered=False))
45984598
store.append('s', s, format='table')
45994599
result = store.select('s')
46004600
tm.assert_series_equal(s, result)
46014601

4602+
s = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c'], categories=['a','b','c','d'], ordered=True))
4603+
store.append('s_ordered', s, format='table')
4604+
result = store.select('s_ordered')
4605+
tm.assert_series_equal(s, result)
4606+
46024607
df = DataFrame({"s":s, "vals":[1,2,3,4,5,6]})
46034608
store.append('df', df, format='table')
46044609
result = store.select('df')
@@ -4639,6 +4644,10 @@ def test_categorical(self):
46394644
result = store.select('df3', where = ['s in ["b","c"]'])
46404645
tm.assert_frame_equal(result, expected)
46414646

4647+
expected = df[df.s.isin(['b','c'])]
4648+
result = store.select('df3', where = ['s = ["b","c"]'])
4649+
tm.assert_frame_equal(result, expected)
4650+
46424651
expected = df[df.s.isin(['d'])]
46434652
result = store.select('df3', where = ['s in ["d"]'])
46444653
tm.assert_frame_equal(result, expected)

0 commit comments

Comments (0)