diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index f08fe96704b2a..2afce9e4545b4 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1981,10 +1981,9 @@ def infer(self, handler: "Table"): new_self.read_metadata(handler) return new_self - def convert( - self, values: np.ndarray, nan_rep, encoding, errors, start=None, stop=None - ): + def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str): """ set the values from this selection: take = take ownership """ + assert isinstance(values, np.ndarray), type(values) # values is a recarray if values.dtype.fields is not None: @@ -1993,21 +1992,23 @@ def convert( values = _maybe_convert(values, self.kind, encoding, errors) kwargs = dict() + kwargs["name"] = _ensure_decoded(self.index_name) + if self.freq is not None: kwargs["freq"] = _ensure_decoded(self.freq) - if self.index_name is not None: - kwargs["name"] = _ensure_decoded(self.index_name) + # making an Index instance could throw a number of different errors try: - self.values = Index(values, **kwargs) + new_pd_index = Index(values, **kwargs) except ValueError: # if the output freq is different that what we recorded, # it should be None (see also 'doc example part 2') if "freq" in kwargs: kwargs["freq"] = None - self.values = Index(values, **kwargs) + new_pd_index = Index(values, **kwargs) - self.values = _set_tz(self.values, self.tz) + new_pd_index = _set_tz(new_pd_index, self.tz) + self.values = new_pd_index def take_data(self): """ return the values & release the memory """ @@ -2167,35 +2168,19 @@ class GenericIndexCol(IndexCol): def is_indexed(self) -> bool: return False - def convert( - self, - values, - nan_rep, - encoding, - errors, - start: Optional[int] = None, - stop: Optional[int] = None, - ): - """ set the values from this selection: take = take ownership + def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str): + """ + Set the values from this selection. Parameters ---------- - values : np.ndarray nan_rep : str encoding : str errors : str - start : int, optional - Table row number: the start of the sub-selection. - stop : int, optional - Table row number: the end of the sub-selection. Values larger than - the underlying table's row count are normalized to that. """ - assert self.table is not None # for mypy - - _start = start if start is not None else 0 - _stop = min(stop, self.table.nrows) if stop is not None else self.table.nrows - self.values = Int64Index(np.arange(_stop - _start)) + assert isinstance(values, np.ndarray), type(values) + self.values = Int64Index(np.arange(len(values))) def get_attr(self): pass @@ -2395,10 +2380,11 @@ def validate_attr(self, append): "items dtype in table!" ) - def convert(self, values, nan_rep, encoding, errors, start=None, stop=None): + def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str): """set the data from this selection (and convert to the correct dtype if we can) """ + assert isinstance(values, np.ndarray), type(values) # values is a recarray if values.dtype.fields is not None: @@ -2410,69 +2396,74 @@ def convert(self, values, nan_rep, encoding, errors, start=None, stop=None): else: self.data = values + own_data = self.data + # use the meta if needed meta = _ensure_decoded(self.meta) + assert self.dtype is not None + # convert to the correct dtype - if self.dtype is not None: - dtype = _ensure_decoded(self.dtype) + dtype = _ensure_decoded(self.dtype) - # reverse converts - if dtype == "datetime64": + # reverse converts + if dtype == "datetime64": - # recreate with tz if indicated - self.data = _set_tz(self.data, self.tz, coerce=True) + # recreate with tz if indicated + own_data = _set_tz(own_data, self.tz, coerce=True) - elif dtype == "timedelta64": - self.data = np.asarray(self.data, dtype="m8[ns]") - elif dtype == "date": - try: - self.data = np.asarray( - [date.fromordinal(v) for v in self.data], dtype=object - ) - except ValueError: - self.data = np.asarray( - [date.fromtimestamp(v) for v in self.data], dtype=object - ) - - elif meta == "category": - - # we have a categorical - categories = self.metadata - codes = self.data.ravel() - - # if we have stored a NaN in the categories - # then strip it; in theory we could have BOTH - # -1s in the codes and nulls :< - if categories is None: - # Handle case of NaN-only categorical columns in which case - # the categories are an empty array; when this is stored, - # pytables cannot write a zero-len array, so on readback - # the categories would be None and `read_hdf()` would fail. - categories = Index([], dtype=np.float64) - else: - mask = isna(categories) - if mask.any(): - categories = categories[~mask] - codes[codes != -1] -= mask.astype(int).cumsum().values - - self.data = Categorical.from_codes( - codes, categories=categories, ordered=self.ordered + elif dtype == "timedelta64": + own_data = np.asarray(own_data, dtype="m8[ns]") + elif dtype == "date": + try: + own_data = np.asarray( + [date.fromordinal(v) for v in own_data], dtype=object + ) + except ValueError: + own_data = np.asarray( + [date.fromtimestamp(v) for v in own_data], dtype=object ) + elif meta == "category": + + # we have a categorical + categories = self.metadata + codes = own_data.ravel() + + # if we have stored a NaN in the categories + # then strip it; in theory we could have BOTH + # -1s in the codes and nulls :< + if categories is None: + # Handle case of NaN-only categorical columns in which case + # the categories are an empty array; when this is stored, + # pytables cannot write a zero-len array, so on readback + # the categories would be None and `read_hdf()` would fail. + categories = Index([], dtype=np.float64) else: + mask = isna(categories) + if mask.any(): + categories = categories[~mask] + codes[codes != -1] -= mask.astype(int).cumsum().values - try: - self.data = self.data.astype(dtype, copy=False) - except TypeError: - self.data = self.data.astype("O", copy=False) + own_data = Categorical.from_codes( + codes, categories=categories, ordered=self.ordered + ) + + else: + + try: + own_data = own_data.astype(dtype, copy=False) + except TypeError: + own_data = own_data.astype("O", copy=False) # convert nans / decode if _ensure_decoded(self.kind) == "string": - self.data = _unconvert_string_array( - self.data, nan_rep=nan_rep, encoding=encoding, errors=errors + own_data = _unconvert_string_array( + own_data, nan_rep=nan_rep, encoding=encoding, errors=errors ) + self.data = own_data + def get_attr(self): """ get the data for this column """ self.values = getattr(self.attrs, self.kind_attr, None) @@ -3613,8 +3604,6 @@ def read_axes( nan_rep=self.nan_rep, encoding=self.encoding, errors=self.errors, - start=start, - stop=stop, ) return True @@ -4887,16 +4876,15 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None, errors="strict"): return data.reshape(shape) -def _maybe_convert(values: np.ndarray, val_kind, encoding, errors): +def _maybe_convert(values: np.ndarray, val_kind, encoding: str, errors: str): val_kind = _ensure_decoded(val_kind) if _need_convert(val_kind): conv = _get_converter(val_kind, encoding, errors) - # conv = np.frompyfunc(conv, 1, 1) values = conv(values) return values -def _get_converter(kind: str, encoding, errors): +def _get_converter(kind: str, encoding: str, errors: str): if kind == "datetime64": return lambda x: np.asarray(x, dtype="M8[ns]") elif kind == "string":