From 2612c44b330089f1f81afd449d9d9a3670b917ca Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 2 Dec 2019 13:38:08 -0800 Subject: [PATCH 1/4] implement _maybe_convert_for_string_atom --- pandas/io/pytables.py | 207 ++++++++++++++++++++---------------------- 1 file changed, 100 insertions(+), 107 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 5a42df92ddf84..4c312105f6a36 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2198,115 +2198,36 @@ def set_kind(self): if self.typ is None: self.typ = getattr(self.description, self.cname, None) - def set_atom( - self, - block, - block_items, - existing_col, - min_itemsize, - nan_rep, - info, - encoding=None, - errors="strict", - ): + def set_atom(self, block, itemsize: int, data_converted, use_str: bool): """ create and setup my atom from the block b """ - self.values = list(block_items) - # short-cut certain block types if block.is_categorical: - return self.set_atom_categorical(block, items=block_items, info=info) + self.set_atom_categorical(block) elif block.is_datetimetz: - return self.set_atom_datetime64tz(block, info=info) + self.set_atom_datetime64tz(block) elif block.is_datetime: - return self.set_atom_datetime64(block) + self.set_atom_datetime64(block) elif block.is_timedelta: - return self.set_atom_timedelta64(block) + self.set_atom_timedelta64(block) elif block.is_complex: - return self.set_atom_complex(block) - - dtype = block.dtype.name - inferred_type = lib.infer_dtype(block.values, skipna=False) + self.set_atom_complex(block) - if inferred_type == "date": - raise TypeError("[date] is not implemented as a table column") - elif inferred_type == "datetime": - # after GH#8260 - # this only would be hit for a multi-timezone dtype - # which is an error - - raise TypeError( - "too many timezones in this block, create separate data columns" - ) - elif inferred_type == "unicode": - raise TypeError("[unicode] is not implemented as a table column") - - # this is basically a catchall; if say a datetime64 has nans then will - # end up here ### - elif inferred_type == "string" or dtype == "object": - self.set_atom_string( - block, - block_items, - existing_col, - min_itemsize, - nan_rep, - encoding, - errors, - ) - - # set as a data block + elif use_str: + # TODO: should we reshape data_converted to orig shape? + self.set_atom_string(itemsize, data_converted, block.shape) else: + # set as a data block self.set_atom_data(block) - def get_atom_string(self, block, itemsize): - return _tables().StringCol(itemsize=itemsize, shape=block.shape[0]) + def get_atom_string(self, shape, itemsize): + return _tables().StringCol(itemsize=itemsize, shape=shape[0]) - def set_atom_string( - self, block, block_items, existing_col, min_itemsize, nan_rep, encoding, errors - ): - # fill nan items with myself, don't disturb the blocks by - # trying to downcast - block = block.fillna(nan_rep, downcast=False) - if isinstance(block, list): - block = block[0] - data = block.values - - # see if we have a valid string type - inferred_type = lib.infer_dtype(data.ravel(), skipna=False) - if inferred_type != "string": - - # we cannot serialize this data, so report an exception on a column - # by column basis - for i, item in enumerate(block_items): - - col = block.iget(i) - inferred_type = lib.infer_dtype(col.ravel(), skipna=False) - if inferred_type != "string": - raise TypeError( - f"Cannot serialize the column [{item}] because\n" - f"its data contents are [{inferred_type}] object dtype" - ) - - # itemsize is the maximum length of a string (along any dimension) - data_converted = _convert_string_array(data, encoding, errors) - itemsize = data_converted.itemsize - - # specified min_itemsize? - if isinstance(min_itemsize, dict): - min_itemsize = int( - min_itemsize.get(self.name) or min_itemsize.get("values") or 0 - ) - itemsize = max(min_itemsize or 0, itemsize) - - # check for column in the values conflicts - if existing_col is not None: - eci = existing_col.validate_col(itemsize) - if eci > itemsize: - itemsize = eci + def set_atom_string(self, itemsize, data_converted, shape): self.itemsize = itemsize self.kind = "string" - self.typ = self.get_atom_string(block, itemsize) + self.typ = self.get_atom_string(shape, itemsize) self.set_data(data_converted.astype(f"|S{itemsize}", copy=False)) def get_atom_coltype(self, kind=None): @@ -2336,7 +2257,7 @@ def set_atom_data(self, block): self.typ = self.get_atom_data(block) self.set_data(block.values.astype(self.typ.type, copy=False)) - def set_atom_categorical(self, block, items, info=None): + def set_atom_categorical(self, block): # currently only supports a 1-D categorical # in a 1-D block @@ -2346,8 +2267,6 @@ def set_atom_categorical(self, block, items, info=None): self.dtype = codes.dtype.name if values.ndim > 1: raise NotImplementedError("only support 1-d categoricals") - if len(items) > 1: - raise NotImplementedError("only support single block categoricals") # write the codes; must be in a block shape self.ordered = values.ordered @@ -2358,9 +2277,6 @@ def set_atom_categorical(self, block, items, info=None): self.meta = "category" self.set_metadata(block.values.categories) - # update the info - self.update_info(info) - def get_atom_datetime64(self, block): return _tables().Int64Col(shape=block.shape[0]) @@ -2370,7 +2286,7 @@ def set_atom_datetime64(self, block): values = block.values.view("i8") self.set_data(values, "datetime64") - def set_atom_datetime64tz(self, block, info): + def set_atom_datetime64tz(self, block): values = block.values @@ -2379,7 +2295,6 @@ def set_atom_datetime64tz(self, block, info): # store a converted timezone self.tz = _get_tz(block.values.tz) - self.update_info(info) self.kind = "datetime64" self.typ = self.get_atom_datetime64(block) @@ -2516,7 +2431,7 @@ def validate_names(self): # TODO: should the message here be more specifically non-str? raise ValueError("cannot have non-object label DataIndexableCol") - def get_atom_string(self, block, itemsize): + def get_atom_string(self, shape, itemsize): return _tables().StringCol(itemsize=itemsize) def get_atom_data(self, block, kind=None): @@ -3872,17 +3787,26 @@ def get_blk_items(mgr, blocks): else: existing_col = None - col = klass.create_for_block(i=i, name=name, version=self.version) - col.set_atom( - block=b, - block_items=b_items, + new_name = name or f"values_block_{i}" + itemsize, data_converted, use_str = _maybe_convert_for_string_atom( + new_name, + b, existing_col=existing_col, min_itemsize=min_itemsize, nan_rep=nan_rep, encoding=self.encoding, errors=self.errors, - info=self.info, ) + + col = klass.create_for_block(i=i, name=new_name, version=self.version) + col.values = list(b_items) + col.set_atom( + block=b, + itemsize=itemsize, + data_converted=data_converted, + use_str=use_str, + ) + col.update_info(self.info) col.set_pos(j) self.values_axes.append(col) @@ -4746,6 +4670,75 @@ def _unconvert_index(data, kind: str, encoding=None, errors="strict"): return index +def _maybe_convert_for_string_atom( + name, block, existing_col, min_itemsize, nan_rep, encoding, errors +): + use_str = False + + if not block.is_object: + return block.dtype.itemsize, block.values, use_str + + dtype_name = block.dtype.name + inferred_type = lib.infer_dtype(block.values, skipna=False) + + if inferred_type == "date": + raise TypeError("[date] is not implemented as a table column") + elif inferred_type == "datetime": + # after GH#8260 + # this only would be hit for a multi-timezone dtype which is an error + raise TypeError( + "too many timezones in this block, create separate data columns" + ) + + elif not (inferred_type == "string" or dtype_name == "object"): + return block.dtype.itemsize, block.values, use_str + + use_str = True + + block = block.fillna(nan_rep, downcast=False) + if isinstance(block, list): + # Note: because block is always object dtype, fillna goes + # through a path such that the result is always a 1-element list + block = block[0] + data = block.values + + # see if we have a valid string type + inferred_type = lib.infer_dtype(data.ravel(), skipna=False) + if inferred_type != "string": + + # we cannot serialize this data, so report an exception on a column + # by column basis + for i in range(len(block.shape[0])): + + col = block.iget(i) + inferred_type = lib.infer_dtype(col.ravel(), skipna=False) + if inferred_type != "string": + iloc = block.mgr_locs.indexer[i] + raise TypeError( + f"Cannot serialize the column [{iloc}] because\n" + f"its data contents are [{inferred_type}] object dtype" + ) + + # itemsize is the maximum length of a string (along any dimension) + data_converted = _convert_string_array( + data, encoding, errors + ) # TODO: do we need to reshape? + itemsize = data_converted.itemsize + + # specified min_itemsize? + if isinstance(min_itemsize, dict): + min_itemsize = int(min_itemsize.get(name) or min_itemsize.get("values") or 0) + itemsize = max(min_itemsize or 0, itemsize) + + # check for column in the values conflicts + if existing_col is not None: + eci = existing_col.validate_col(itemsize) + if eci > itemsize: + itemsize = eci + + return itemsize, data_converted, use_str + + def _convert_string_array(data, encoding, errors, itemsize=None): """ we take a string-like that is object dtype and coerce to a fixed size From 4c0004c728451ba1fa2c361c0f0220a7c42e5e3c Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 2 Dec 2019 13:43:17 -0800 Subject: [PATCH 2/4] reshape data_converted --- pandas/io/pytables.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 4c312105f6a36..3361edc33000e 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2214,8 +2214,7 @@ def set_atom(self, block, itemsize: int, data_converted, use_str: bool): self.set_atom_complex(block) elif use_str: - # TODO: should we reshape data_converted to orig shape? - self.set_atom_string(itemsize, data_converted, block.shape) + self.set_atom_string(itemsize, data_converted) else: # set as a data block self.set_atom_data(block) @@ -2223,11 +2222,11 @@ def set_atom(self, block, itemsize: int, data_converted, use_str: bool): def get_atom_string(self, shape, itemsize): return _tables().StringCol(itemsize=itemsize, shape=shape[0]) - def set_atom_string(self, itemsize, data_converted, shape): + def set_atom_string(self, itemsize, data_converted): self.itemsize = itemsize self.kind = "string" - self.typ = self.get_atom_string(shape, itemsize) + self.typ = self.get_atom_string(data_converted.shape, itemsize) self.set_data(data_converted.astype(f"|S{itemsize}", copy=False)) def get_atom_coltype(self, kind=None): @@ -4671,7 +4670,7 @@ def _unconvert_index(data, kind: str, encoding=None, errors="strict"): def _maybe_convert_for_string_atom( - name, block, existing_col, min_itemsize, nan_rep, encoding, errors + name: str, block, existing_col, min_itemsize, nan_rep, encoding, errors ): use_str = False @@ -4720,9 +4719,9 @@ def _maybe_convert_for_string_atom( ) # itemsize is the maximum length of a string (along any dimension) - data_converted = _convert_string_array( - data, encoding, errors - ) # TODO: do we need to reshape? + data_converted = _convert_string_array(data, encoding, errors).reshape(data.shape) + # NB: the reshape here is new! + assert data_converted.shape == block.shape, (data_converted.shape, block.shape) itemsize = data_converted.itemsize # specified min_itemsize? From 958c8102f7b117b39162765a118fb7a0c65d3df8 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 2 Dec 2019 13:46:15 -0800 Subject: [PATCH 3/4] types --- pandas/io/pytables.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 3361edc33000e..802b5d42cc4bc 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2222,8 +2222,7 @@ def set_atom(self, block, itemsize: int, data_converted, use_str: bool): def get_atom_string(self, shape, itemsize): return _tables().StringCol(itemsize=itemsize, shape=shape[0]) - def set_atom_string(self, itemsize, data_converted): - + def set_atom_string(self, itemsize: int, data_converted: np.ndarray): self.itemsize = itemsize self.kind = "string" self.typ = self.get_atom_string(data_converted.shape, itemsize) From c1af0c8d394bc2584a72f4bd9562a38bd48766c8 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 4 Dec 2019 10:08:08 -0800 Subject: [PATCH 4/4] remove comment --- pandas/io/pytables.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 6a41775b2061c..5b7bb64795d90 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -4825,7 +4825,6 @@ def _maybe_convert_for_string_atom( # itemsize is the maximum length of a string (along any dimension) data_converted = _convert_string_array(data, encoding, errors).reshape(data.shape) - # NB: the reshape here is new! assert data_converted.shape == block.shape, (data_converted.shape, block.shape) itemsize = data_converted.itemsize