Skip to content

REF: dont alter state in pytables read_axes #30184

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Dec 12, 2019
Merged
134 changes: 83 additions & 51 deletions pandas/io/pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -1965,7 +1965,9 @@ def is_indexed(self) -> bool:
return getattr(self.table.cols, self.cname).is_indexed # type: ignore

def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str):
""" set the values from this selection: take = take ownership """
"""
Convert the data from this selection to the appropriate pandas type.
"""
assert isinstance(values, np.ndarray), type(values)

# values is a recarray
Expand All @@ -1991,7 +1993,7 @@ def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str):
new_pd_index = Index(values, **kwargs)

new_pd_index = _set_tz(new_pd_index, self.tz)
self.values = new_pd_index
return new_pd_index, new_pd_index

def take_data(self):
""" return the values & release the memory """
Expand Down Expand Up @@ -2145,7 +2147,7 @@ def is_indexed(self) -> bool:

def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str):
"""
Set the values from this selection.
Convert the data from this selection to the appropriate pandas type.

Parameters
----------
Expand All @@ -2155,7 +2157,9 @@ def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str):
errors : str
"""
assert isinstance(values, np.ndarray), type(values)
self.values = Int64Index(np.arange(len(values)))

values = Int64Index(np.arange(len(values)))
return values, values

def set_attr(self):
pass
Expand Down Expand Up @@ -2340,8 +2344,20 @@ def validate_attr(self, append):
)

def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str):
"""set the data from this selection (and convert to the correct dtype
if we can)
"""
Convert the data from this selection to the appropriate pandas type.

Parameters
----------
values : np.ndarray
nan_rep :
encoding : str
errors : str

Returns
-------
index : listlike to become an Index
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ideally type at some point (e.g. return value)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yah, ATM we're not consistent about what this is (e.g. sometimes list) so id like to to get this more specific

data : ndarraylike to become a column
"""
assert isinstance(values, np.ndarray), type(values)

Expand All @@ -2351,44 +2367,50 @@ def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str):

assert self.typ is not None
if self.dtype is None:
self.set_data(values)
# Note: in tests we never have timedelta64 or datetime64,
# so the _get_data_and_dtype_name may be unnecessary
converted, dtype_name = _get_data_and_dtype_name(values)
kind = _dtype_to_kind(dtype_name)
else:
self.data = values
converted = values
dtype_name = self.dtype
kind = self.kind

own_data = self.data
assert isinstance(own_data, np.ndarray) # for mypy
assert isinstance(converted, np.ndarray) # for mypy

# use the meta if needed
meta = _ensure_decoded(self.meta)
metadata = self.metadata
ordered = self.ordered
tz = self.tz

assert self.dtype is not None

assert dtype_name is not None
# convert to the correct dtype
dtype = _ensure_decoded(self.dtype)
dtype = _ensure_decoded(dtype_name)

# reverse converts
if dtype == "datetime64":

# recreate with tz if indicated
own_data = _set_tz(own_data, self.tz, coerce=True)
converted = _set_tz(converted, tz, coerce=True)

elif dtype == "timedelta64":
own_data = np.asarray(own_data, dtype="m8[ns]")
converted = np.asarray(converted, dtype="m8[ns]")
elif dtype == "date":
try:
own_data = np.asarray(
[date.fromordinal(v) for v in own_data], dtype=object
converted = np.asarray(
[date.fromordinal(v) for v in converted], dtype=object
)
except ValueError:
own_data = np.asarray(
[date.fromtimestamp(v) for v in own_data], dtype=object
converted = np.asarray(
[date.fromtimestamp(v) for v in converted], dtype=object
)

elif meta == "category":

# we have a categorical
categories = self.metadata
codes = own_data.ravel()
categories = metadata
codes = converted.ravel()

# if we have stored a NaN in the categories
# then strip it; in theory we could have BOTH
Expand All @@ -2405,24 +2427,24 @@ def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str):
categories = categories[~mask]
codes[codes != -1] -= mask.astype(int).cumsum().values

own_data = Categorical.from_codes(
codes, categories=categories, ordered=self.ordered
converted = Categorical.from_codes(
codes, categories=categories, ordered=ordered
)

else:

try:
own_data = own_data.astype(dtype, copy=False)
converted = converted.astype(dtype, copy=False)
except TypeError:
own_data = own_data.astype("O", copy=False)
converted = converted.astype("O", copy=False)

# convert nans / decode
if _ensure_decoded(self.kind) == "string":
own_data = _unconvert_string_array(
own_data, nan_rep=nan_rep, encoding=encoding, errors=errors
if _ensure_decoded(kind) == "string":
converted = _unconvert_string_array(
converted, nan_rep=nan_rep, encoding=encoding, errors=errors
)

self.data = own_data
return self.values, converted

def set_attr(self):
""" set the data for this column """
Expand Down Expand Up @@ -3554,9 +3576,9 @@ def create_index(self, columns=None, optlevel=None, kind: Optional[str] = None):
)
v.create_index(**kw)

def read_axes(
def _read_axes(
self, where, start: Optional[int] = None, stop: Optional[int] = None
) -> bool:
) -> List[Tuple[ArrayLike, ArrayLike]]:
"""
Create the axes sniffed from the table.

Expand All @@ -3568,32 +3590,26 @@ def read_axes(

Returns
-------
bool
Indicates success.
List[Tuple[index_values, column_values]]
"""

# validate the version
self.validate_version(where)

# infer the data kind
if not self.infer_axes():
return False

# create the selection
selection = Selection(self, where=where, start=start, stop=stop)
values = selection.select()

results = []
# convert the data
for a in self.axes:
a.set_info(self.info)
a.convert(
res = a.convert(
values,
nan_rep=self.nan_rep,
encoding=self.encoding,
errors=self.errors,
)
results.append(res)

return True
return results

def get_object(self, obj, transposed: bool):
""" return the data for this obj """
Expand Down Expand Up @@ -4040,13 +4056,13 @@ def read_column(
# column must be an indexable or a data column
c = getattr(self.table.cols, column)
a.set_info(self.info)
a.convert(
col_values = a.convert(
c[start:stop],
nan_rep=self.nan_rep,
encoding=self.encoding,
errors=self.errors,
)
return Series(_set_tz(a.take_data(), a.tz), name=column)
return Series(_set_tz(col_values[1], a.tz), name=column)

raise KeyError(f"column [{column}] not found in the table")

Expand Down Expand Up @@ -4330,34 +4346,50 @@ def read(
stop: Optional[int] = None,
):

if not self.read_axes(where=where, start=start, stop=stop):
# validate the version
self.validate_version(where)

# infer the data kind
if not self.infer_axes():
return None

result = self._read_axes(where=where, start=start, stop=stop)

info = (
self.info.get(self.non_index_axes[0][0], dict())
if len(self.non_index_axes)
else dict()
)
index = self.index_axes[0].values

inds = [i for i, ax in enumerate(self.axes) if ax is self.index_axes[0]]
assert len(inds) == 1
ind = inds[0]

index = result[ind][0]

frames = []
for a in self.values_axes:
for i, a in enumerate(self.axes):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

confusing y ou are using self.axes here again

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we need i to do the lookup result[i] and a to do the check just below. there may be a nicer way to do this if/when values_axes ceases to be stateful

if a not in self.values_axes:
continue
index_vals, cvalues = result[i]

# we could have a multi-index constructor here
# ensure_index doesn't recognized our list-of-tuples here
if info.get("type") == "MultiIndex":
cols = MultiIndex.from_tuples(a.values)
cols = MultiIndex.from_tuples(index_vals)
else:
cols = Index(a.values)
cols = Index(index_vals)

names = info.get("names")
if names is not None:
cols.set_names(names, inplace=True)

if self.is_transposed:
values = a.cvalues
values = cvalues
index_ = cols
cols_ = Index(index, name=getattr(index, "name", None))
else:
values = a.cvalues.T
values = cvalues.T
index_ = Index(index, name=getattr(index, "name", None))
cols_ = cols

Expand Down