From 2a7bc87b6eec6a3af357a9ff3d6bdd45ea7a7dee Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 8 Dec 2019 09:39:40 -0800 Subject: [PATCH 1/4] REF: implement _get_data_and_dtype_name in pytables --- pandas/io/pytables.py | 49 ++++++++++++++++++++++++++++++------------- 1 file changed, 35 insertions(+), 14 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 22bca5d3a5659..5763991d365fa 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2262,16 +2262,7 @@ def set_data(self, data: Union[np.ndarray, ABCExtensionArray]): assert data is not None assert self.dtype is None - if is_categorical_dtype(data.dtype): - data = data.codes - - # For datetime64tz we need to drop the TZ in tests TODO: why? - dtype_name = data.dtype.name.split("[")[0] - - if data.dtype.kind in ["m", "M"]: - data = np.asarray(data.view("i8")) - # TODO: we used to reshape for the dt64tz case, but no longer - # doing that doesnt seem to break anything. why? + data, dtype_name = _get_data_and_dtype_name(data) self.data = data self.dtype = dtype_name @@ -4623,9 +4614,12 @@ def _convert_index(name: str, index: Index, encoding=None, errors="strict"): assert isinstance(name, str) index_name = index.name + converted, dtype_name = _get_data_and_dtype_name(index) + kind = _dtype_to_kind(dtype_name) if isinstance(index, DatetimeIndex): converted = index.asi8 + assert kind == "datetime64", kind return IndexCol( name, converted, @@ -4637,6 +4631,7 @@ def _convert_index(name: str, index: Index, encoding=None, errors="strict"): ) elif isinstance(index, TimedeltaIndex): converted = index.asi8 + assert kind == "timedelta64", kind return IndexCol( name, converted, @@ -4648,6 +4643,7 @@ def _convert_index(name: str, index: Index, encoding=None, errors="strict"): elif isinstance(index, (Int64Index, PeriodIndex)): atom = _tables().Int64Col() # avoid to store ndarray of Period objects + assert kind == "integer", kind return IndexCol( name, index._ndarray_values, @@ -4687,6 +4683,7 @@ def _convert_index(name: str, index: Index, encoding=None, errors="strict"): elif inferred_type == "integer": # take a guess for now, hope the values fit + assert kind == "integer", kind atom = _tables().Int64Col() return IndexCol( name, @@ -4696,6 +4693,7 @@ def _convert_index(name: str, index: Index, encoding=None, errors="strict"): index_name=index_name, ) elif inferred_type == "floating": + assert kind == "float", kind atom = _tables().Float64Col() return IndexCol( name, @@ -4705,6 +4703,7 @@ def _convert_index(name: str, index: Index, encoding=None, errors="strict"): index_name=index_name, ) else: + assert kind == "object", kind atom = _tables().ObjectAtom() return IndexCol( name, np.asarray(values, dtype="O"), "object", atom, index_name=index_name, @@ -4936,21 +4935,43 @@ def _dtype_to_kind(dtype_str: str) -> str: kind = "complex" elif dtype_str.startswith("int") or dtype_str.startswith("uint"): kind = "integer" - elif dtype_str.startswith("date"): - # in tests this is always "datetime64" - kind = "datetime" + elif dtype_str.startswith("datetime64"): + kind = "datetime64" elif dtype_str.startswith("timedelta"): - kind = "timedelta" + kind = "timedelta64" elif dtype_str.startswith("bool"): kind = "bool" elif dtype_str.startswith("category"): kind = "category" + elif dtype_str.startswith("period"): + # We store the `freq` attr so we can restore from integers + kind = "integer" + elif dtype_str == "object": + kind = "object" else: raise ValueError(f"cannot interpret dtype of [{dtype_str}]") return kind +def _get_data_and_dtype_name(data: Union[np.ndarray, ABCExtensionArray]): + """ + Convert the passed data into a storable form and a dtype string. + """ + if is_categorical_dtype(data.dtype): + data = data.codes + + # For datetime64tz we need to drop the TZ in tests TODO: why? + dtype_name = data.dtype.name.split("[")[0] + + if data.dtype.kind in ["m", "M"]: + data = np.asarray(data.view("i8")) + # TODO: we used to reshape for the dt64tz case, but no longer + # doing that doesnt seem to break anything. why? + + return data, dtype_name + + class Selection: """ Carries out a selection operation on a tables.Table object. From ecdf92d19873a0d2e634ce1eeb86c18c502c37f3 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 8 Dec 2019 10:28:26 -0800 Subject: [PATCH 2/4] REF: use _get_atom to de-duplicate _convert_index --- pandas/io/pytables.py | 58 ++++++++++++++++++++++++++----------------- 1 file changed, 35 insertions(+), 23 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index d3cd5732acf46..e8fd666242ae7 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2309,6 +2309,9 @@ def get_atom_coltype(cls, kind: str) -> Type["Col"]: if kind.startswith("uint"): k4 = kind[4:] col_name = f"UInt{k4}Col" + elif kind.startswith("period"): + # we store as integer + col_name = "Int64Col" else: kcap = kind.capitalize() col_name = f"{kcap}Col" @@ -4605,39 +4608,43 @@ def _convert_index(name: str, index: Index, encoding=None, errors="strict"): index_name = index.name converted, dtype_name = _get_data_and_dtype_name(index) kind = _dtype_to_kind(dtype_name) + atom = DataIndexableCol._get_atom(converted) if isinstance(index, DatetimeIndex): - converted = index.asi8 + assert isinstance(converted, np.ndarray) and converted.dtype == "i8" assert kind == "datetime64", kind + assert isinstance(atom, _tables().Int64Col), atom.dtype return IndexCol( name, - converted, - "datetime64", - _tables().Int64Col(), + values=converted, + kind=kind, + typ=atom, freq=index.freq, tz=index.tz, index_name=index_name, ) elif isinstance(index, TimedeltaIndex): - converted = index.asi8 + assert isinstance(converted, np.ndarray) and converted.dtype == "i8" assert kind == "timedelta64", kind + assert isinstance(atom, _tables().Int64Col), atom.dtype return IndexCol( name, - converted, - "timedelta64", - _tables().Int64Col(), + values=converted, + kind=kind, + typ=atom, freq=index.freq, index_name=index_name, ) elif isinstance(index, (Int64Index, PeriodIndex)): - atom = _tables().Int64Col() # avoid to store ndarray of Period objects + assert isinstance(converted, np.ndarray) and converted.dtype == "i8" assert kind == "integer", kind + assert isinstance(atom, _tables().Int64Col), atom.dtype return IndexCol( name, - index._ndarray_values, - "integer", - atom, + values=converted, + kind=kind, + typ=atom, freq=getattr(index, "freq", None), index_name=index_name, ) @@ -4657,8 +4664,6 @@ def _convert_index(name: str, index: Index, encoding=None, errors="strict"): name, converted, "date", _tables().Time32Col(), index_name=index_name, ) elif inferred_type == "string": - # atom = _tables().ObjectAtom() - # return np.asarray(values, dtype='O'), 'object', atom converted = _convert_string_array(values, encoding, errors) itemsize = converted.dtype.itemsize @@ -4672,30 +4677,33 @@ def _convert_index(name: str, index: Index, encoding=None, errors="strict"): elif inferred_type == "integer": # take a guess for now, hope the values fit + assert isinstance(converted, np.ndarray) and converted.dtype == "i8" assert kind == "integer", kind - atom = _tables().Int64Col() + assert isinstance(atom, _tables().Int64Col), atom.dtype return IndexCol( name, - np.asarray(values, dtype=np.int64), - "integer", - atom, + values=converted, + kind=kind, + typ=atom, index_name=index_name, ) elif inferred_type == "floating": + assert isinstance(converted, np.ndarray) and converted.dtype == "f8" assert kind == "float", kind - atom = _tables().Float64Col() + assert isinstance(atom, _tables().Float64Col), atom.dtype return IndexCol( name, - np.asarray(values, dtype=np.float64), - "float", - atom, + values=converted, + kind=kind, + typ=atom, index_name=index_name, ) else: + assert isinstance(converted, np.ndarray) and converted.dtype == object assert kind == "object", kind atom = _tables().ObjectAtom() return IndexCol( - name, np.asarray(values, dtype="O"), "object", atom, index_name=index_name, + name, converted, kind, atom, index_name=index_name, ) @@ -4957,6 +4965,10 @@ def _get_data_and_dtype_name(data: Union[np.ndarray, ABCExtensionArray]): # TODO: we used to reshape for the dt64tz case, but no longer # doing that doesnt seem to break anything. why? + elif isinstance(data, PeriodIndex): + data = data.asi8 + + data = np.asarray(data) return data, dtype_name From 500fca2c80ed4ab599f25a767ffa95a9b772d102 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 8 Dec 2019 11:03:42 -0800 Subject: [PATCH 3/4] blackify --- pandas/io/pytables.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index e8fd666242ae7..6da853773d4f4 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -4681,30 +4681,20 @@ def _convert_index(name: str, index: Index, encoding=None, errors="strict"): assert kind == "integer", kind assert isinstance(atom, _tables().Int64Col), atom.dtype return IndexCol( - name, - values=converted, - kind=kind, - typ=atom, - index_name=index_name, + name, values=converted, kind=kind, typ=atom, index_name=index_name, ) elif inferred_type == "floating": assert isinstance(converted, np.ndarray) and converted.dtype == "f8" assert kind == "float", kind assert isinstance(atom, _tables().Float64Col), atom.dtype return IndexCol( - name, - values=converted, - kind=kind, - typ=atom, - index_name=index_name, + name, values=converted, kind=kind, typ=atom, index_name=index_name, ) else: assert isinstance(converted, np.ndarray) and converted.dtype == object assert kind == "object", kind atom = _tables().ObjectAtom() - return IndexCol( - name, converted, kind, atom, index_name=index_name, - ) + return IndexCol(name, converted, kind, atom, index_name=index_name,) def _unconvert_index(data, kind: str, encoding=None, errors="strict"): From 9c8313a524fa500dcfaa1792fa108e8b18ef945a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 8 Dec 2019 12:49:23 -0800 Subject: [PATCH 4/4] collapse redundant --- pandas/io/pytables.py | 48 ++++++------------------------------------- 1 file changed, 6 insertions(+), 42 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index baef07e799ac1..fc0c4ccf5b65b 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -4650,42 +4650,17 @@ def _convert_index(name: str, index: Index, encoding=None, errors="strict"): kind = _dtype_to_kind(dtype_name) atom = DataIndexableCol._get_atom(converted) - if isinstance(index, DatetimeIndex): - assert isinstance(converted, np.ndarray) and converted.dtype == "i8" - assert kind == "datetime64", kind - assert isinstance(atom, _tables().Int64Col), atom.dtype - return IndexCol( - name, - values=converted, - kind=kind, - typ=atom, - freq=index.freq, - tz=index.tz, - index_name=index_name, - ) - elif isinstance(index, TimedeltaIndex): - assert isinstance(converted, np.ndarray) and converted.dtype == "i8" - assert kind == "timedelta64", kind - assert isinstance(atom, _tables().Int64Col), atom.dtype - return IndexCol( - name, - values=converted, - kind=kind, - typ=atom, - freq=index.freq, - index_name=index_name, - ) - elif isinstance(index, (Int64Index, PeriodIndex)): - # avoid to store ndarray of Period objects - assert isinstance(converted, np.ndarray) and converted.dtype == "i8" - assert kind == "integer", kind - assert isinstance(atom, _tables().Int64Col), atom.dtype + if isinstance(index, Int64Index): + # Includes Int64Index, RangeIndex, DatetimeIndex, TimedeltaIndex, PeriodIndex, + # in which case "kind" is "integer", "integer", "datetime64", + # "timedelta64", and "integer", respectively. return IndexCol( name, values=converted, kind=kind, typ=atom, freq=getattr(index, "freq", None), + tz=getattr(index, "tz", None), index_name=index_name, ) @@ -4715,18 +4690,7 @@ def _convert_index(name: str, index: Index, encoding=None, errors="strict"): index_name=index_name, ) - elif inferred_type == "integer": - # take a guess for now, hope the values fit - assert isinstance(converted, np.ndarray) and converted.dtype == "i8" - assert kind == "integer", kind - assert isinstance(atom, _tables().Int64Col), atom.dtype - return IndexCol( - name, values=converted, kind=kind, typ=atom, index_name=index_name, - ) - elif inferred_type == "floating": - assert isinstance(converted, np.ndarray) and converted.dtype == "f8" - assert kind == "float", kind - assert isinstance(atom, _tables().Float64Col), atom.dtype + elif inferred_type in ["integer", "floating"]: return IndexCol( name, values=converted, kind=kind, typ=atom, index_name=index_name, )