diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index a95d7f39ab82c..f08fe96704b2a 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2221,15 +2221,6 @@ class DataCol(IndexCol): is_data_indexable = False _info_fields = ["tz", "ordered"] - @classmethod - def create_for_block(cls, name: str, version, pos: int): - """ return a new datacol with the block i """ - assert isinstance(name, str) - - cname = name - name = _maybe_adjust_name(name, version) - return cls(name=name, cname=cname, pos=pos) - def __init__( self, name: str, values=None, kind=None, typ=None, cname=None, pos=None, ): @@ -2269,6 +2260,7 @@ def __eq__(self, other: Any) -> bool: def set_data(self, data: Union[np.ndarray, ABCExtensionArray]): assert data is not None + assert self.dtype is None if is_categorical_dtype(data.dtype): data = data.codes @@ -2282,44 +2274,14 @@ def set_data(self, data: Union[np.ndarray, ABCExtensionArray]): # doing that doesnt seem to break anything. why? self.data = data - - if self.dtype is None: - self.dtype = dtype_name - self.set_kind() + self.dtype = dtype_name + self.kind = _dtype_to_kind(dtype_name) def take_data(self): """ return the data & release the memory """ self.data, data = None, self.data return data - def set_kind(self): - # set my kind if we can - - if self.dtype is not None: - dtype = _ensure_decoded(self.dtype) - - if dtype.startswith("string") or dtype.startswith("bytes"): - self.kind = "string" - elif dtype.startswith("float"): - self.kind = "float" - elif dtype.startswith("complex"): - self.kind = "complex" - elif dtype.startswith("int") or dtype.startswith("uint"): - self.kind = "integer" - elif dtype.startswith("date"): - # in tests this is always "datetime64" - self.kind = "datetime" - elif dtype.startswith("timedelta"): - self.kind = "timedelta" - elif dtype.startswith("bool"): - self.kind = "bool" - else: - raise AssertionError(f"cannot interpret dtype of [{dtype}] in [{self}]") - - # set my typ if we need - if self.typ is None: - self.typ = getattr(self.description, self.cname, None) - def set_atom(self, block): """ create and setup my atom from the block b """ @@ -2442,8 +2404,11 @@ def convert(self, values, nan_rep, encoding, errors, start=None, stop=None): if values.dtype.fields is not None: values = values[self.cname] - # NB: unlike in the other calls to set_data, self.dtype may not be None here - self.set_data(values) + assert self.typ is not None + if self.dtype is None: + self.set_data(values) + else: + self.data = values # use the meta if needed meta = _ensure_decoded(self.meta) @@ -2513,14 +2478,16 @@ def get_attr(self): self.values = getattr(self.attrs, self.kind_attr, None) self.dtype = getattr(self.attrs, self.dtype_attr, None) self.meta = getattr(self.attrs, self.meta_attr, None) - self.set_kind() + assert self.typ is not None + assert self.dtype is not None + self.kind = _dtype_to_kind(self.dtype) def set_attr(self): """ set the data for this column """ setattr(self.attrs, self.kind_attr, self.values) setattr(self.attrs, self.meta_attr, self.meta) - if self.dtype is not None: - setattr(self.attrs, self.dtype_attr, self.dtype) + assert self.dtype is not None + setattr(self.attrs, self.dtype_attr, self.dtype) class DataIndexableCol(DataCol): @@ -3501,15 +3468,15 @@ def indexables(self): """ create/cache the indexables if they don't exist """ _indexables = [] + desc = self.description + # Note: each of the `name` kwargs below are str, ensured # by the definition in index_cols. # index columns - _indexables.extend( - [ - IndexCol(name=name, axis=axis, pos=i) - for i, (axis, name) in enumerate(self.attrs.index_cols) - ] - ) + for i, (axis, name) in enumerate(self.attrs.index_cols): + atom = getattr(desc, name) + index_col = IndexCol(name=name, axis=axis, pos=i, typ=atom) + _indexables.append(index_col) # values columns dc = set(self.data_columns) @@ -3520,9 +3487,10 @@ def f(i, c): klass = DataCol if c in dc: klass = DataIndexableCol - return klass.create_for_block( - name=c, pos=base_pos + i, version=self.version - ) + + atom = getattr(desc, c) + adj_name = _maybe_adjust_name(c, self.version) + return klass(name=adj_name, cname=c, pos=base_pos + i, typ=atom) # Note: the definition of `values_cols` ensures that each # `c` below is a str. @@ -3903,9 +3871,15 @@ def get_blk_items(mgr, blocks): adj_name = _maybe_adjust_name(new_name, self.version) typ = klass._get_atom(data_converted) + kind = _dtype_to_kind(data_converted.dtype.name) col = klass( - name=adj_name, cname=new_name, values=list(b_items), typ=typ, pos=j + name=adj_name, + cname=new_name, + values=list(b_items), + typ=typ, + pos=j, + kind=kind, ) col.set_atom(block=b) col.set_data(data_converted) @@ -4527,13 +4501,16 @@ def indexables(self): """ create the indexables from the table description """ d = self.description + # TODO: can we get a typ for this? AFAICT it is the only place + # where we aren't passing one # the index columns is just a simple index _indexables = [GenericIndexCol(name="index", axis=0)] for i, n in enumerate(d._v_names): assert isinstance(n, str) - dc = GenericDataIndexableCol(name=n, pos=i, values=[n]) + atom = getattr(d, n) + dc = GenericDataIndexableCol(name=n, pos=i, values=[n], typ=atom) _indexables.append(dc) return _indexables @@ -4959,6 +4936,35 @@ def _maybe_adjust_name(name: str, version) -> str: return name +def _dtype_to_kind(dtype_str: str) -> str: + """ + Find the "kind" string describing the given dtype name. + """ + dtype_str = _ensure_decoded(dtype_str) + + if dtype_str.startswith("string") or dtype_str.startswith("bytes"): + kind = "string" + elif dtype_str.startswith("float"): + kind = "float" + elif dtype_str.startswith("complex"): + kind = "complex" + elif dtype_str.startswith("int") or dtype_str.startswith("uint"): + kind = "integer" + elif dtype_str.startswith("date"): + # in tests this is always "datetime64" + kind = "datetime" + elif dtype_str.startswith("timedelta"): + kind = "timedelta" + elif dtype_str.startswith("bool"): + kind = "bool" + elif dtype_str.startswith("category"): + kind = "category" + else: + raise ValueError(f"cannot interpret dtype of [{dtype_str}]") + + return kind + + class Selection: """ Carries out a selection operation on a tables.Table object.