Skip to content

REF: use new pytables helper methods to de-duplicate _convert_index #30144

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Dec 10, 2019
Merged
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 66 additions & 43 deletions pandas/io/pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -2247,16 +2247,7 @@ def set_data(self, data: Union[np.ndarray, ABCExtensionArray]):
assert data is not None
assert self.dtype is None

if is_categorical_dtype(data.dtype):
data = data.codes

# For datetime64tz we need to drop the TZ in tests TODO: why?
dtype_name = data.dtype.name.split("[")[0]

if data.dtype.kind in ["m", "M"]:
data = np.asarray(data.view("i8"))
# TODO: we used to reshape for the dt64tz case, but no longer
# doing that doesn't seem to break anything. why?
data, dtype_name = _get_data_and_dtype_name(data)

self.data = data
self.dtype = dtype_name
Expand Down Expand Up @@ -2318,6 +2309,9 @@ def get_atom_coltype(cls, kind: str) -> Type["Col"]:
if kind.startswith("uint"):
k4 = kind[4:]
col_name = f"UInt{k4}Col"
elif kind.startswith("period"):
# we store as integer
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FYI: we may have very little test coverage for PeriodIndex.

col_name = "Int64Col"
else:
kcap = kind.capitalize()
col_name = f"{kcap}Col"
Expand Down Expand Up @@ -4612,36 +4606,45 @@ def _convert_index(name: str, index: Index, encoding=None, errors="strict"):
assert isinstance(name, str)

index_name = index.name
converted, dtype_name = _get_data_and_dtype_name(index)
kind = _dtype_to_kind(dtype_name)
atom = DataIndexableCol._get_atom(converted)

if isinstance(index, DatetimeIndex):
converted = index.asi8
assert isinstance(converted, np.ndarray) and converted.dtype == "i8"
assert kind == "datetime64", kind
assert isinstance(atom, _tables().Int64Col), atom.dtype
return IndexCol(
name,
converted,
"datetime64",
_tables().Int64Col(),
values=converted,
kind=kind,
typ=atom,
freq=index.freq,
tz=index.tz,
index_name=index_name,
)
elif isinstance(index, TimedeltaIndex):
converted = index.asi8
assert isinstance(converted, np.ndarray) and converted.dtype == "i8"
assert kind == "timedelta64", kind
assert isinstance(atom, _tables().Int64Col), atom.dtype
return IndexCol(
name,
converted,
"timedelta64",
_tables().Int64Col(),
values=converted,
kind=kind,
typ=atom,
freq=index.freq,
index_name=index_name,
)
elif isinstance(index, (Int64Index, PeriodIndex)):
atom = _tables().Int64Col()
# avoid storing an ndarray of Period objects
assert isinstance(converted, np.ndarray) and converted.dtype == "i8"
assert kind == "integer", kind
assert isinstance(atom, _tables().Int64Col), atom.dtype
return IndexCol(
name,
index._ndarray_values,
"integer",
atom,
values=converted,
kind=kind,
typ=atom,
freq=getattr(index, "freq", None),
index_name=index_name,
)
Expand All @@ -4661,8 +4664,6 @@ def _convert_index(name: str, index: Index, encoding=None, errors="strict"):
name, converted, "date", _tables().Time32Col(), index_name=index_name,
)
elif inferred_type == "string":
# atom = _tables().ObjectAtom()
# return np.asarray(values, dtype='O'), 'object', atom

converted = _convert_string_array(values, encoding, errors)
itemsize = converted.dtype.itemsize
Expand All @@ -4676,28 +4677,24 @@ def _convert_index(name: str, index: Index, encoding=None, errors="strict"):

elif inferred_type == "integer":
# take a guess for now, hope the values fit
atom = _tables().Int64Col()
assert isinstance(converted, np.ndarray) and converted.dtype == "i8"
assert kind == "integer", kind
assert isinstance(atom, _tables().Int64Col), atom.dtype
return IndexCol(
name,
np.asarray(values, dtype=np.int64),
"integer",
atom,
index_name=index_name,
name, values=converted, kind=kind, typ=atom, index_name=index_name,
)
elif inferred_type == "floating":
atom = _tables().Float64Col()
assert isinstance(converted, np.ndarray) and converted.dtype == "f8"
assert kind == "float", kind
assert isinstance(atom, _tables().Float64Col), atom.dtype
return IndexCol(
name,
np.asarray(values, dtype=np.float64),
"float",
atom,
index_name=index_name,
name, values=converted, kind=kind, typ=atom, index_name=index_name,
)
else:
assert isinstance(converted, np.ndarray) and converted.dtype == object
assert kind == "object", kind
atom = _tables().ObjectAtom()
return IndexCol(
name, np.asarray(values, dtype="O"), "object", atom, index_name=index_name,
)
return IndexCol(name, converted, kind, atom, index_name=index_name,)


def _unconvert_index(data, kind: str, encoding=None, errors="strict"):
Expand Down Expand Up @@ -4924,21 +4921,47 @@ def _dtype_to_kind(dtype_str: str) -> str:
kind = "complex"
elif dtype_str.startswith("int") or dtype_str.startswith("uint"):
kind = "integer"
elif dtype_str.startswith("date"):
# in tests this is always "datetime64"
kind = "datetime"
elif dtype_str.startswith("datetime64"):
kind = "datetime64"
elif dtype_str.startswith("timedelta"):
kind = "timedelta"
kind = "timedelta64"
elif dtype_str.startswith("bool"):
kind = "bool"
elif dtype_str.startswith("category"):
kind = "category"
elif dtype_str.startswith("period"):
# We store the `freq` attr so we can restore from integers
kind = "integer"
elif dtype_str == "object":
kind = "object"
else:
raise ValueError(f"cannot interpret dtype of [{dtype_str}]")

return kind


def _get_data_and_dtype_name(data: Union[np.ndarray, ABCExtensionArray]):
"""
Convert the passed data into a storable form and a dtype string.
"""
if is_categorical_dtype(data.dtype):
data = data.codes

# For datetime64tz we need to drop the TZ in tests TODO: why?
dtype_name = data.dtype.name.split("[")[0]

if data.dtype.kind in ["m", "M"]:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use needs_i8_conversion here (fine to do in a follow-on PR).

data = np.asarray(data.view("i8"))
# TODO: we used to reshape for the dt64tz case, but no longer
# doing that doesn't seem to break anything. why?

elif isinstance(data, PeriodIndex):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can likely use extract_array here

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That would give a PeriodArray; we need to get the i8 values back here.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right: once you have the arrays, you can simply call .view('i8') and you are done (this works pretty generically).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, I see what you're saying now. Will do on the next pass (along with the needs_i8_conversion change mentioned above).

data = data.asi8

data = np.asarray(data)
return data, dtype_name


class Selection:
"""
Carries out a selection operation on a tables.Table object.
Expand Down