Skip to content

CLN: clean pytables convert methods #30125

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Dec 8, 2019
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
152 changes: 70 additions & 82 deletions pandas/io/pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -1981,10 +1981,9 @@ def infer(self, handler: "Table"):
new_self.read_metadata(handler)
return new_self

def convert(
self, values: np.ndarray, nan_rep, encoding, errors, start=None, stop=None
):
def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str):
""" set the values from this selection: take = take ownership """
assert isinstance(values, np.ndarray), type(values)

# values is a recarray
if values.dtype.fields is not None:
Expand All @@ -1993,21 +1992,23 @@ def convert(
values = _maybe_convert(values, self.kind, encoding, errors)

kwargs = dict()
kwargs["name"] = _ensure_decoded(self.index_name)

if self.freq is not None:
kwargs["freq"] = _ensure_decoded(self.freq)
if self.index_name is not None:
kwargs["name"] = _ensure_decoded(self.index_name)

# making an Index instance could throw a number of different errors
try:
self.values = Index(values, **kwargs)
new_pd_index = Index(values, **kwargs)
except ValueError:
# if the output freq is different that what we recorded,
# it should be None (see also 'doc example part 2')
if "freq" in kwargs:
kwargs["freq"] = None
self.values = Index(values, **kwargs)
new_pd_index = Index(values, **kwargs)

self.values = _set_tz(self.values, self.tz)
new_pd_index = _set_tz(new_pd_index, self.tz)
self.values = new_pd_index

def take_data(self):
""" return the values & release the memory """
Expand Down Expand Up @@ -2167,35 +2168,19 @@ class GenericIndexCol(IndexCol):
def is_indexed(self) -> bool:
return False

def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str):
    """
    Set the values from this selection.

    Parameters
    ----------
    values : np.ndarray
        Raw values read from the table; only their length is used here.
    nan_rep : str
        NaN representation (unused for a generic index; kept for a
        signature consistent with the other ``convert`` methods).
    encoding : str
    errors : str
    """
    # Diff scrape interleaved the pre- and post-refactor versions of this
    # method; this is the post-refactor body, which drops the start/stop
    # parameters and derives the index length from `values` directly.
    assert isinstance(values, np.ndarray), type(values)

    # A generic index is purely positional: build a 0..len-1 Int64Index
    # rather than using the stored values themselves.
    self.values = Int64Index(np.arange(len(values)))

def get_attr(self):
pass
Expand Down Expand Up @@ -2395,10 +2380,11 @@ def validate_attr(self, append):
"items dtype in table!"
)

def convert(self, values, nan_rep, encoding, errors, start=None, stop=None):
def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str):
"""set the data from this selection (and convert to the correct dtype
if we can)
"""
assert isinstance(values, np.ndarray), type(values)

# values is a recarray
if values.dtype.fields is not None:
Expand All @@ -2410,69 +2396,74 @@ def convert(self, values, nan_rep, encoding, errors, start=None, stop=None):
else:
self.data = values

own_data = self.data

# use the meta if needed
meta = _ensure_decoded(self.meta)

assert self.dtype is not None

# convert to the correct dtype
if self.dtype is not None:
dtype = _ensure_decoded(self.dtype)
dtype = _ensure_decoded(self.dtype)

# reverse converts
if dtype == "datetime64":
# reverse converts
if dtype == "datetime64":

# recreate with tz if indicated
self.data = _set_tz(self.data, self.tz, coerce=True)
# recreate with tz if indicated
own_data = _set_tz(own_data, self.tz, coerce=True)

elif dtype == "timedelta64":
self.data = np.asarray(self.data, dtype="m8[ns]")
elif dtype == "date":
try:
self.data = np.asarray(
[date.fromordinal(v) for v in self.data], dtype=object
)
except ValueError:
self.data = np.asarray(
[date.fromtimestamp(v) for v in self.data], dtype=object
)

elif meta == "category":

# we have a categorical
categories = self.metadata
codes = self.data.ravel()

# if we have stored a NaN in the categories
# then strip it; in theory we could have BOTH
# -1s in the codes and nulls :<
if categories is None:
# Handle case of NaN-only categorical columns in which case
# the categories are an empty array; when this is stored,
# pytables cannot write a zero-len array, so on readback
# the categories would be None and `read_hdf()` would fail.
categories = Index([], dtype=np.float64)
else:
mask = isna(categories)
if mask.any():
categories = categories[~mask]
codes[codes != -1] -= mask.astype(int).cumsum().values

self.data = Categorical.from_codes(
codes, categories=categories, ordered=self.ordered
elif dtype == "timedelta64":
own_data = np.asarray(own_data, dtype="m8[ns]")
elif dtype == "date":
try:
own_data = np.asarray(
[date.fromordinal(v) for v in own_data], dtype=object
)
except ValueError:
own_data = np.asarray(
[date.fromtimestamp(v) for v in own_data], dtype=object
)

elif meta == "category":

# we have a categorical
categories = self.metadata
codes = own_data.ravel()

# if we have stored a NaN in the categories
# then strip it; in theory we could have BOTH
# -1s in the codes and nulls :<
if categories is None:
# Handle case of NaN-only categorical columns in which case
# the categories are an empty array; when this is stored,
# pytables cannot write a zero-len array, so on readback
# the categories would be None and `read_hdf()` would fail.
categories = Index([], dtype=np.float64)
else:
mask = isna(categories)
if mask.any():
categories = categories[~mask]
codes[codes != -1] -= mask.astype(int).cumsum().values

try:
self.data = self.data.astype(dtype, copy=False)
except TypeError:
self.data = self.data.astype("O", copy=False)
own_data = Categorical.from_codes(
codes, categories=categories, ordered=self.ordered
)

else:

try:
own_data = own_data.astype(dtype, copy=False)
except TypeError:
own_data = own_data.astype("O", copy=False)

# convert nans / decode
if _ensure_decoded(self.kind) == "string":
self.data = _unconvert_string_array(
self.data, nan_rep=nan_rep, encoding=encoding, errors=errors
own_data = _unconvert_string_array(
own_data, nan_rep=nan_rep, encoding=encoding, errors=errors
)

self.data = own_data

def get_attr(self):
""" get the data for this column """
self.values = getattr(self.attrs, self.kind_attr, None)
Expand Down Expand Up @@ -3613,8 +3604,6 @@ def read_axes(
nan_rep=self.nan_rep,
encoding=self.encoding,
errors=self.errors,
start=start,
stop=stop,
)

return True
Expand Down Expand Up @@ -4887,16 +4876,15 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None, errors="strict"):
return data.reshape(shape)


def _maybe_convert(values: np.ndarray, val_kind, encoding: str, errors: str):
    """
    Convert raw read values according to the kind recorded for the column.

    Parameters
    ----------
    values : np.ndarray
        Values as read from the HDF5 store.
    val_kind : str or bytes
        Recorded kind; decoded with ``_ensure_decoded`` before the lookup.
    encoding : str
        Encoding used when unconverting string data.
    errors : str
        Error-handling policy passed through to the string converter.

    Returns
    -------
    np.ndarray
        Converted values, or the input unchanged when no conversion is
        needed for this kind.
    """
    # The scraped diff kept both the old and new signature lines; this is
    # the post-refactor version with `encoding`/`errors` annotated as str.
    val_kind = _ensure_decoded(val_kind)
    if _need_convert(val_kind):
        conv = _get_converter(val_kind, encoding, errors)
        values = conv(values)
    return values


def _get_converter(kind: str, encoding, errors):
def _get_converter(kind: str, encoding: str, errors: str):
if kind == "datetime64":
return lambda x: np.asarray(x, dtype="M8[ns]")
elif kind == "string":
Expand Down