-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
REF: pytables do string conversion early to set attributes in fewer places #30058
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
jreback
merged 9 commits into
pandas-dev:master
from
jbrockmendel:cln-pytables-maybe_convert
Dec 5, 2019
Merged
Changes from all commits
Commits
Show all changes
9 commits
Select commit
Hold shift + click to select a range
2612c44
implement _maybe_convert_for_string_atom
jbrockmendel 4c0004c
reshape data_converted
jbrockmendel 958c810
types
jbrockmendel 5485355
Merge branch 'master' of https://github.com/pandas-dev/pandas into cl…
jbrockmendel 433b861
Merge branch 'master' of https://github.com/pandas-dev/pandas into cl…
jbrockmendel c05d82e
Merge branch 'master' of https://github.com/pandas-dev/pandas into cl…
jbrockmendel 1587661
Merge branch 'master' of https://github.com/pandas-dev/pandas into cl…
jbrockmendel b3e958d
Merge branch 'master' of https://github.com/pandas-dev/pandas into cl…
jbrockmendel c1af0c8
remove comment
jbrockmendel File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2313,111 +2313,34 @@ def set_kind(self): | |
if self.typ is None: | ||
self.typ = getattr(self.description, self.cname, None) | ||
|
||
def set_atom( | ||
self, | ||
block, | ||
existing_col, | ||
min_itemsize, | ||
nan_rep, | ||
info, | ||
encoding=None, | ||
errors="strict", | ||
): | ||
def set_atom(self, block, itemsize: int, data_converted, use_str: bool): | ||
""" create and setup my atom from the block b """ | ||
|
||
# short-cut certain block types | ||
if block.is_categorical: | ||
self.set_atom_categorical(block) | ||
self.update_info(info) | ||
return | ||
elif block.is_datetimetz: | ||
self.set_atom_datetime64tz(block) | ||
self.update_info(info) | ||
return | ||
elif block.is_datetime: | ||
return self.set_atom_datetime64(block) | ||
self.set_atom_datetime64(block) | ||
elif block.is_timedelta: | ||
return self.set_atom_timedelta64(block) | ||
self.set_atom_timedelta64(block) | ||
elif block.is_complex: | ||
return self.set_atom_complex(block) | ||
|
||
dtype = block.dtype.name | ||
inferred_type = lib.infer_dtype(block.values, skipna=False) | ||
self.set_atom_complex(block) | ||
|
||
if inferred_type == "date": | ||
raise TypeError("[date] is not implemented as a table column") | ||
elif inferred_type == "datetime": | ||
# after GH#8260 | ||
# this only would be hit for a multi-timezone dtype | ||
# which is an error | ||
|
||
raise TypeError( | ||
"too many timezones in this block, create separate data columns" | ||
) | ||
elif inferred_type == "unicode": | ||
raise TypeError("[unicode] is not implemented as a table column") | ||
|
||
# this is basically a catchall; if say a datetime64 has nans then will | ||
# end up here ### | ||
elif inferred_type == "string" or dtype == "object": | ||
self.set_atom_string( | ||
block, existing_col, min_itemsize, nan_rep, encoding, errors, | ||
) | ||
|
||
# set as a data block | ||
elif use_str: | ||
self.set_atom_string(itemsize, data_converted) | ||
else: | ||
# set as a data block | ||
self.set_atom_data(block) | ||
|
||
def get_atom_string(self, block, itemsize): | ||
return _tables().StringCol(itemsize=itemsize, shape=block.shape[0]) | ||
|
||
def set_atom_string( | ||
self, block, existing_col, min_itemsize, nan_rep, encoding, errors | ||
): | ||
# fill nan items with myself, don't disturb the blocks by | ||
# trying to downcast | ||
block = block.fillna(nan_rep, downcast=False) | ||
if isinstance(block, list): | ||
block = block[0] | ||
data = block.values | ||
|
||
# see if we have a valid string type | ||
inferred_type = lib.infer_dtype(data.ravel(), skipna=False) | ||
if inferred_type != "string": | ||
|
||
# we cannot serialize this data, so report an exception on a column | ||
# by column basis | ||
for i in range(len(block.shape[0])): | ||
|
||
col = block.iget(i) | ||
inferred_type = lib.infer_dtype(col.ravel(), skipna=False) | ||
if inferred_type != "string": | ||
iloc = block.mgr_locs.indexer[i] | ||
raise TypeError( | ||
f"Cannot serialize the column [{iloc}] because\n" | ||
f"its data contents are [{inferred_type}] object dtype" | ||
) | ||
|
||
# itemsize is the maximum length of a string (along any dimension) | ||
data_converted = _convert_string_array(data, encoding, errors) | ||
itemsize = data_converted.itemsize | ||
|
||
# specified min_itemsize? | ||
if isinstance(min_itemsize, dict): | ||
min_itemsize = int( | ||
min_itemsize.get(self.name) or min_itemsize.get("values") or 0 | ||
) | ||
itemsize = max(min_itemsize or 0, itemsize) | ||
|
||
# check for column in the values conflicts | ||
if existing_col is not None: | ||
eci = existing_col.validate_col(itemsize) | ||
if eci > itemsize: | ||
itemsize = eci | ||
def get_atom_string(self, shape, itemsize): | ||
return _tables().StringCol(itemsize=itemsize, shape=shape[0]) | ||
|
||
def set_atom_string(self, itemsize: int, data_converted: np.ndarray): | ||
self.itemsize = itemsize | ||
self.kind = "string" | ||
self.typ = self.get_atom_string(block, itemsize) | ||
self.typ = self.get_atom_string(data_converted.shape, itemsize) | ||
self.set_data(data_converted.astype(f"|S{itemsize}", copy=False)) | ||
|
||
def get_atom_coltype(self, kind=None): | ||
|
@@ -2621,7 +2544,7 @@ def validate_names(self): | |
# TODO: should the message here be more specifically non-str? | ||
raise ValueError("cannot have non-object label DataIndexableCol") | ||
|
||
def get_atom_string(self, block, itemsize): | ||
def get_atom_string(self, shape, itemsize): | ||
return _tables().StringCol(itemsize=itemsize) | ||
|
||
def get_atom_data(self, block, kind=None): | ||
|
@@ -3974,17 +3897,26 @@ def get_blk_items(mgr, blocks): | |
else: | ||
existing_col = None | ||
|
||
col = klass.create_for_block(i=i, name=name, version=self.version) | ||
col.values = list(b_items) | ||
col.set_atom( | ||
block=b, | ||
new_name = name or f"values_block_{i}" | ||
itemsize, data_converted, use_str = _maybe_convert_for_string_atom( | ||
new_name, | ||
b, | ||
existing_col=existing_col, | ||
min_itemsize=min_itemsize, | ||
nan_rep=nan_rep, | ||
encoding=self.encoding, | ||
errors=self.errors, | ||
info=self.info, | ||
) | ||
|
||
col = klass.create_for_block(i=i, name=new_name, version=self.version) | ||
col.values = list(b_items) | ||
col.set_atom( | ||
block=b, | ||
itemsize=itemsize, | ||
data_converted=data_converted, | ||
use_str=use_str, | ||
) | ||
col.update_info(self.info) | ||
col.set_pos(j) | ||
|
||
self.values_axes.append(col) | ||
|
@@ -4842,6 +4774,74 @@ def _unconvert_index(data, kind: str, encoding=None, errors="strict"): | |
return index | ||
|
||
|
||
def _maybe_convert_for_string_atom( | ||
name: str, block, existing_col, min_itemsize, nan_rep, encoding, errors | ||
): | ||
use_str = False | ||
|
||
if not block.is_object: | ||
return block.dtype.itemsize, block.values, use_str | ||
|
||
dtype_name = block.dtype.name | ||
inferred_type = lib.infer_dtype(block.values, skipna=False) | ||
|
||
if inferred_type == "date": | ||
raise TypeError("[date] is not implemented as a table column") | ||
elif inferred_type == "datetime": | ||
# after GH#8260 | ||
# this only would be hit for a multi-timezone dtype which is an error | ||
raise TypeError( | ||
"too many timezones in this block, create separate data columns" | ||
) | ||
|
||
elif not (inferred_type == "string" or dtype_name == "object"): | ||
return block.dtype.itemsize, block.values, use_str | ||
|
||
use_str = True | ||
|
||
block = block.fillna(nan_rep, downcast=False) | ||
if isinstance(block, list): | ||
# Note: because block is always object dtype, fillna goes | ||
# through a path such that the result is always a 1-element list | ||
block = block[0] | ||
data = block.values | ||
|
||
# see if we have a valid string type | ||
inferred_type = lib.infer_dtype(data.ravel(), skipna=False) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. don't you already have this from above? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. no, because we did a fillna on 4802 |
||
if inferred_type != "string": | ||
|
||
# we cannot serialize this data, so report an exception on a column | ||
# by column basis | ||
for i in range(len(block.shape[0])): | ||
|
||
col = block.iget(i) | ||
inferred_type = lib.infer_dtype(col.ravel(), skipna=False) | ||
if inferred_type != "string": | ||
iloc = block.mgr_locs.indexer[i] | ||
raise TypeError( | ||
f"Cannot serialize the column [{iloc}] because\n" | ||
f"its data contents are [{inferred_type}] object dtype" | ||
) | ||
|
||
# itemsize is the maximum length of a string (along any dimension) | ||
data_converted = _convert_string_array(data, encoding, errors).reshape(data.shape) | ||
assert data_converted.shape == block.shape, (data_converted.shape, block.shape) | ||
itemsize = data_converted.itemsize | ||
|
||
# specified min_itemsize? | ||
if isinstance(min_itemsize, dict): | ||
min_itemsize = int(min_itemsize.get(name) or min_itemsize.get("values") or 0) | ||
itemsize = max(min_itemsize or 0, itemsize) | ||
|
||
# check for column in the values conflicts | ||
if existing_col is not None: | ||
eci = existing_col.validate_col(itemsize) | ||
if eci > itemsize: | ||
itemsize = eci | ||
|
||
return itemsize, data_converted, use_str | ||
|
||
|
||
def _convert_string_array(data, encoding, errors, itemsize=None): | ||
""" | ||
we take a string-like that is object dtype and coerce to a fixed size | ||
|
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
just cut / paste?
add a doc-string / types at some point
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Mostly cut/paste, but from a couple different places instead of one big chunk. The
not block.is_object
check on 4782 is new, wasnt needed before because before we were doing things in a different order.The only real change is the
reshape
on 4827 and assertion on 4828 are new.