Skip to content

Commit 8b31f9b

Browse files
jbrockmendel authored and jreback committed
REF: pytables do string conversion early to set attributes in fewer places (#30058)
1 parent 002da29 commit 8b31f9b

File tree

1 file changed

+94
-94
lines changed

1 file changed

+94
-94
lines changed

pandas/io/pytables.py

+94-94
Original file line numberDiff line numberDiff line change
@@ -2313,111 +2313,34 @@ def set_kind(self):
23132313
if self.typ is None:
23142314
self.typ = getattr(self.description, self.cname, None)
23152315

2316-
def set_atom(
2317-
self,
2318-
block,
2319-
existing_col,
2320-
min_itemsize,
2321-
nan_rep,
2322-
info,
2323-
encoding=None,
2324-
errors="strict",
2325-
):
2316+
def set_atom(self, block, itemsize: int, data_converted, use_str: bool):
23262317
""" create and setup my atom from the block b """
23272318

23282319
# short-cut certain block types
23292320
if block.is_categorical:
23302321
self.set_atom_categorical(block)
2331-
self.update_info(info)
2332-
return
23332322
elif block.is_datetimetz:
23342323
self.set_atom_datetime64tz(block)
2335-
self.update_info(info)
2336-
return
23372324
elif block.is_datetime:
2338-
return self.set_atom_datetime64(block)
2325+
self.set_atom_datetime64(block)
23392326
elif block.is_timedelta:
2340-
return self.set_atom_timedelta64(block)
2327+
self.set_atom_timedelta64(block)
23412328
elif block.is_complex:
2342-
return self.set_atom_complex(block)
2343-
2344-
dtype = block.dtype.name
2345-
inferred_type = lib.infer_dtype(block.values, skipna=False)
2329+
self.set_atom_complex(block)
23462330

2347-
if inferred_type == "date":
2348-
raise TypeError("[date] is not implemented as a table column")
2349-
elif inferred_type == "datetime":
2350-
# after GH#8260
2351-
# this only would be hit for a multi-timezone dtype
2352-
# which is an error
2353-
2354-
raise TypeError(
2355-
"too many timezones in this block, create separate data columns"
2356-
)
2357-
elif inferred_type == "unicode":
2358-
raise TypeError("[unicode] is not implemented as a table column")
2359-
2360-
# this is basically a catchall; if say a datetime64 has nans then will
2361-
# end up here ###
2362-
elif inferred_type == "string" or dtype == "object":
2363-
self.set_atom_string(
2364-
block, existing_col, min_itemsize, nan_rep, encoding, errors,
2365-
)
2366-
2367-
# set as a data block
2331+
elif use_str:
2332+
self.set_atom_string(itemsize, data_converted)
23682333
else:
2334+
# set as a data block
23692335
self.set_atom_data(block)
23702336

2371-
def get_atom_string(self, block, itemsize):
2372-
return _tables().StringCol(itemsize=itemsize, shape=block.shape[0])
2373-
2374-
def set_atom_string(
2375-
self, block, existing_col, min_itemsize, nan_rep, encoding, errors
2376-
):
2377-
# fill nan items with myself, don't disturb the blocks by
2378-
# trying to downcast
2379-
block = block.fillna(nan_rep, downcast=False)
2380-
if isinstance(block, list):
2381-
block = block[0]
2382-
data = block.values
2383-
2384-
# see if we have a valid string type
2385-
inferred_type = lib.infer_dtype(data.ravel(), skipna=False)
2386-
if inferred_type != "string":
2387-
2388-
# we cannot serialize this data, so report an exception on a column
2389-
# by column basis
2390-
for i in range(len(block.shape[0])):
2391-
2392-
col = block.iget(i)
2393-
inferred_type = lib.infer_dtype(col.ravel(), skipna=False)
2394-
if inferred_type != "string":
2395-
iloc = block.mgr_locs.indexer[i]
2396-
raise TypeError(
2397-
f"Cannot serialize the column [{iloc}] because\n"
2398-
f"its data contents are [{inferred_type}] object dtype"
2399-
)
2400-
2401-
# itemsize is the maximum length of a string (along any dimension)
2402-
data_converted = _convert_string_array(data, encoding, errors)
2403-
itemsize = data_converted.itemsize
2404-
2405-
# specified min_itemsize?
2406-
if isinstance(min_itemsize, dict):
2407-
min_itemsize = int(
2408-
min_itemsize.get(self.name) or min_itemsize.get("values") or 0
2409-
)
2410-
itemsize = max(min_itemsize or 0, itemsize)
2411-
2412-
# check for column in the values conflicts
2413-
if existing_col is not None:
2414-
eci = existing_col.validate_col(itemsize)
2415-
if eci > itemsize:
2416-
itemsize = eci
2337+
def get_atom_string(self, shape, itemsize):
2338+
return _tables().StringCol(itemsize=itemsize, shape=shape[0])
24172339

2340+
def set_atom_string(self, itemsize: int, data_converted: np.ndarray):
24182341
self.itemsize = itemsize
24192342
self.kind = "string"
2420-
self.typ = self.get_atom_string(block, itemsize)
2343+
self.typ = self.get_atom_string(data_converted.shape, itemsize)
24212344
self.set_data(data_converted.astype(f"|S{itemsize}", copy=False))
24222345

24232346
def get_atom_coltype(self, kind=None):
@@ -2621,7 +2544,7 @@ def validate_names(self):
26212544
# TODO: should the message here be more specifically non-str?
26222545
raise ValueError("cannot have non-object label DataIndexableCol")
26232546

2624-
def get_atom_string(self, block, itemsize):
2547+
def get_atom_string(self, shape, itemsize):
26252548
return _tables().StringCol(itemsize=itemsize)
26262549

26272550
def get_atom_data(self, block, kind=None):
@@ -3972,17 +3895,26 @@ def get_blk_items(mgr, blocks):
39723895
else:
39733896
existing_col = None
39743897

3975-
col = klass.create_for_block(i=i, name=name, version=self.version)
3976-
col.values = list(b_items)
3977-
col.set_atom(
3978-
block=b,
3898+
new_name = name or f"values_block_{i}"
3899+
itemsize, data_converted, use_str = _maybe_convert_for_string_atom(
3900+
new_name,
3901+
b,
39793902
existing_col=existing_col,
39803903
min_itemsize=min_itemsize,
39813904
nan_rep=nan_rep,
39823905
encoding=self.encoding,
39833906
errors=self.errors,
3984-
info=self.info,
39853907
)
3908+
3909+
col = klass.create_for_block(i=i, name=new_name, version=self.version)
3910+
col.values = list(b_items)
3911+
col.set_atom(
3912+
block=b,
3913+
itemsize=itemsize,
3914+
data_converted=data_converted,
3915+
use_str=use_str,
3916+
)
3917+
col.update_info(self.info)
39863918
col.set_pos(j)
39873919

39883920
vaxes.append(col)
@@ -4847,6 +4779,74 @@ def _unconvert_index(data, kind: str, encoding=None, errors="strict"):
48474779
return index
48484780

48494781

4782+
def _maybe_convert_for_string_atom(
    name: str, block, existing_col, min_itemsize, nan_rep, encoding, errors
):
    """
    Decide whether ``block`` must be stored as a string atom and, if so,
    convert its values to a fixed-width string array.

    Parameters
    ----------
    name : str
        Column name; used as the first lookup key when ``min_itemsize``
        is a dict.
    block
        Block whose values are being written.  Must expose ``is_object``,
        ``dtype``, ``values``, ``shape``, ``fillna``, ``iget`` and
        ``mgr_locs``.
    existing_col
        Column object already present in the table, or None.  When given,
        its ``validate_col(itemsize)`` result is used if it is larger than
        the computed itemsize.
    min_itemsize : int, dict or None
        Minimum itemsize.  For a dict, the entry for ``name`` is used,
        falling back to the ``"values"`` entry, then 0.
    nan_rep
        Value used to fill missing entries before conversion.
    encoding, errors
        Passed through to ``_convert_string_array``.

    Returns
    -------
    tuple
        ``(itemsize, data_converted, use_str)``.  For blocks that are not
        stored as strings this is
        ``(block.dtype.itemsize, block.values, False)``.

    Raises
    ------
    TypeError
        If the values are inferred as "date" or "datetime", or if an
        object column holds data that is not inferred as "string".
    """
    use_str = False

    if not block.is_object:
        return block.dtype.itemsize, block.values, use_str

    dtype_name = block.dtype.name
    inferred_type = lib.infer_dtype(block.values, skipna=False)

    if inferred_type == "date":
        raise TypeError("[date] is not implemented as a table column")
    elif inferred_type == "datetime":
        # after GH#8260
        # this only would be hit for a multi-timezone dtype which is an error
        raise TypeError(
            "too many timezones in this block, create separate data columns"
        )

    elif not (inferred_type == "string" or dtype_name == "object"):
        # NOTE(review): presumably unreachable when is_object implies an
        # "object" dtype name -- confirm before removing.
        return block.dtype.itemsize, block.values, use_str

    use_str = True

    # fill nan items with nan_rep; downcast=False so the block is not
    # disturbed by an attempted downcast
    block = block.fillna(nan_rep, downcast=False)
    if isinstance(block, list):
        # Note: because block is always object dtype, fillna goes
        #  through a path such that the result is always a 1-element list
        block = block[0]
    data = block.values

    # see if we have a valid string type
    inferred_type = lib.infer_dtype(data.ravel(), skipna=False)
    if inferred_type != "string":

        # we cannot serialize this data, so report an exception on a column
        # by column basis
        # block.shape[0] is already an int count; wrapping it in len() raised
        # an unrelated TypeError before the message below could be produced.
        for i in range(block.shape[0]):

            col = block.iget(i)
            inferred_type = lib.infer_dtype(col.ravel(), skipna=False)
            if inferred_type != "string":
                iloc = block.mgr_locs.indexer[i]
                raise TypeError(
                    f"Cannot serialize the column [{iloc}] because\n"
                    f"its data contents are [{inferred_type}] object dtype"
                )

    # itemsize is the maximum length of a string (along any dimension)
    data_converted = _convert_string_array(data, encoding, errors).reshape(data.shape)
    assert data_converted.shape == block.shape, (data_converted.shape, block.shape)
    itemsize = data_converted.itemsize

    # specified min_itemsize?
    if isinstance(min_itemsize, dict):
        min_itemsize = int(min_itemsize.get(name) or min_itemsize.get("values") or 0)
    itemsize = max(min_itemsize or 0, itemsize)

    # check for column in the values conflicts
    if existing_col is not None:
        eci = existing_col.validate_col(itemsize)
        if eci > itemsize:
            itemsize = eci

    return itemsize, data_converted, use_str
4848+
4849+
48504850
def _convert_string_array(data, encoding, errors, itemsize=None):
48514851
"""
48524852
we take a string-like that is object dtype and coerce to a fixed size

0 commit comments

Comments
 (0)