-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
REF: use new pytables helper methods to de-duplicate _convert_index #30144
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
2a7bc87
f749063
ecdf92d
500fca2
f237c1f
9c8313a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2250,16 +2250,7 @@ def set_data(self, data: Union[np.ndarray, ABCExtensionArray]): | |
assert data is not None | ||
assert self.dtype is None | ||
|
||
if is_categorical_dtype(data.dtype): | ||
data = data.codes | ||
|
||
# For datetime64tz we need to drop the TZ in tests TODO: why? | ||
dtype_name = data.dtype.name.split("[")[0] | ||
|
||
if data.dtype.kind in ["m", "M"]: | ||
data = np.asarray(data.view("i8")) | ||
# TODO: we used to reshape for the dt64tz case, but no longer | ||
# doing that doesnt seem to break anything. why? | ||
data, dtype_name = _get_data_and_dtype_name(data) | ||
|
||
self.data = data | ||
self.dtype = dtype_name | ||
|
@@ -2321,6 +2312,9 @@ def get_atom_coltype(cls, kind: str) -> Type["Col"]: | |
if kind.startswith("uint"): | ||
k4 = kind[4:] | ||
col_name = f"UInt{k4}Col" | ||
elif kind.startswith("period"): | ||
# we store as integer | ||
col_name = "Int64Col" | ||
else: | ||
kcap = kind.capitalize() | ||
col_name = f"{kcap}Col" | ||
|
@@ -4652,37 +4646,21 @@ def _convert_index(name: str, index: Index, encoding=None, errors="strict"): | |
assert isinstance(name, str) | ||
|
||
index_name = index.name | ||
|
||
if isinstance(index, DatetimeIndex): | ||
converted = index.asi8 | ||
return IndexCol( | ||
name, | ||
converted, | ||
"datetime64", | ||
_tables().Int64Col(), | ||
freq=index.freq, | ||
tz=index.tz, | ||
index_name=index_name, | ||
) | ||
elif isinstance(index, TimedeltaIndex): | ||
converted = index.asi8 | ||
converted, dtype_name = _get_data_and_dtype_name(index) | ||
kind = _dtype_to_kind(dtype_name) | ||
atom = DataIndexableCol._get_atom(converted) | ||
|
||
if isinstance(index, Int64Index): | ||
# Includes Int64Index, RangeIndex, DatetimeIndex, TimedeltaIndex, PeriodIndex, | ||
# in which case "kind" is "integer", "integer", "datetime64", | ||
# "timedelta64", and "integer", respectively. | ||
return IndexCol( | ||
name, | ||
converted, | ||
"timedelta64", | ||
_tables().Int64Col(), | ||
freq=index.freq, | ||
index_name=index_name, | ||
) | ||
elif isinstance(index, (Int64Index, PeriodIndex)): | ||
atom = _tables().Int64Col() | ||
# avoid to store ndarray of Period objects | ||
return IndexCol( | ||
name, | ||
index._ndarray_values, | ||
"integer", | ||
atom, | ||
values=converted, | ||
kind=kind, | ||
typ=atom, | ||
freq=getattr(index, "freq", None), | ||
tz=getattr(index, "tz", None), | ||
index_name=index_name, | ||
) | ||
|
||
|
@@ -4701,8 +4679,6 @@ def _convert_index(name: str, index: Index, encoding=None, errors="strict"): | |
name, converted, "date", _tables().Time32Col(), index_name=index_name, | ||
) | ||
elif inferred_type == "string": | ||
# atom = _tables().ObjectAtom() | ||
# return np.asarray(values, dtype='O'), 'object', atom | ||
|
||
converted = _convert_string_array(values, encoding, errors) | ||
itemsize = converted.dtype.itemsize | ||
|
@@ -4714,30 +4690,15 @@ def _convert_index(name: str, index: Index, encoding=None, errors="strict"): | |
index_name=index_name, | ||
) | ||
|
||
elif inferred_type == "integer": | ||
# take a guess for now, hope the values fit | ||
atom = _tables().Int64Col() | ||
elif inferred_type in ["integer", "floating"]: | ||
return IndexCol( | ||
name, | ||
np.asarray(values, dtype=np.int64), | ||
"integer", | ||
atom, | ||
index_name=index_name, | ||
) | ||
elif inferred_type == "floating": | ||
atom = _tables().Float64Col() | ||
return IndexCol( | ||
name, | ||
np.asarray(values, dtype=np.float64), | ||
"float", | ||
atom, | ||
index_name=index_name, | ||
name, values=converted, kind=kind, typ=atom, index_name=index_name, | ||
) | ||
else: | ||
assert isinstance(converted, np.ndarray) and converted.dtype == object | ||
assert kind == "object", kind | ||
atom = _tables().ObjectAtom() | ||
return IndexCol( | ||
name, np.asarray(values, dtype="O"), "object", atom, index_name=index_name, | ||
) | ||
return IndexCol(name, converted, kind, atom, index_name=index_name,) | ||
|
||
|
||
def _unconvert_index(data, kind: str, encoding=None, errors="strict"): | ||
|
@@ -4964,21 +4925,47 @@ def _dtype_to_kind(dtype_str: str) -> str: | |
kind = "complex" | ||
elif dtype_str.startswith("int") or dtype_str.startswith("uint"): | ||
kind = "integer" | ||
elif dtype_str.startswith("date"): | ||
# in tests this is always "datetime64" | ||
kind = "datetime" | ||
elif dtype_str.startswith("datetime64"): | ||
kind = "datetime64" | ||
elif dtype_str.startswith("timedelta"): | ||
kind = "timedelta" | ||
kind = "timedelta64" | ||
elif dtype_str.startswith("bool"): | ||
kind = "bool" | ||
elif dtype_str.startswith("category"): | ||
kind = "category" | ||
elif dtype_str.startswith("period"): | ||
# We store the `freq` attr so we can restore from integers | ||
kind = "integer" | ||
elif dtype_str == "object": | ||
kind = "object" | ||
else: | ||
raise ValueError(f"cannot interpret dtype of [{dtype_str}]") | ||
|
||
return kind | ||
|
||
|
||
def _get_data_and_dtype_name(data: Union[np.ndarray, ABCExtensionArray]): | ||
""" | ||
Convert the passed data into a storable form and a dtype string. | ||
""" | ||
if is_categorical_dtype(data.dtype): | ||
data = data.codes | ||
|
||
# For datetime64tz we need to drop the TZ in tests TODO: why? | ||
dtype_name = data.dtype.name.split("[")[0] | ||
|
||
if data.dtype.kind in ["m", "M"]: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Use `needs_i8_conversion` here instead of checking `data.dtype.kind` directly (doing this in a follow-on PR is OK). |
||
data = np.asarray(data.view("i8")) | ||
# TODO: we used to reshape for the dt64tz case, but no longer | ||
# doing that doesnt seem to break anything. why? | ||
|
||
elif isinstance(data, PeriodIndex): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can likely use `extract_array` here. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. That would give a PeriodArray; we need to get i8 back here. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Right: once you have the arrays, you can simply call `.view('i8')` and you are done (pretty generically). There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. OK, I see what you're saying now. Will do on the next pass (along with the `needs_i8_conversion` change mentioned above). |
||
data = data.asi8 | ||
|
||
data = np.asarray(data) | ||
return data, dtype_name | ||
|
||
|
||
class Selection: | ||
""" | ||
Carries out a selection operation on a tables.Table object. | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
FYI: we may have very little test coverage for PeriodIndex.