From d39d5e7a49bb1ea28b7a02f486757d6ba0986639 Mon Sep 17 00:00:00 2001
From: jreback
Date: Mon, 3 Jun 2013 16:21:32 -0400
Subject: [PATCH 1/7] TST: travis to build numexpr/tables on py3k

---
 ci/install.sh | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/ci/install.sh b/ci/install.sh
index a091834a9570f..b748070db85aa 100755
--- a/ci/install.sh
+++ b/ci/install.sh
@@ -69,13 +69,11 @@ if ( ! $VENV_FILE_AVAILABLE ); then
     pip install $PIP_ARGS cython

     if [ ${TRAVIS_PYTHON_VERSION:0:1} == "2" ]; then
-        # installed explicitly above, to get the library as well
-        # sudo apt-get $APT_ARGS install libhdf5-serial-dev;
-        pip install numexpr
-        pip install tables
         pip install $PIP_ARGS xlwt
     fi
+    pip install numexpr
+    pip install tables
     pip install $PIP_ARGS matplotlib
     pip install $PIP_ARGS openpyxl
     pip install $PIP_ARGS xlrd>=0.9.0

From 870c6489fcd5b091deb6f5d952ad5a6d117e38cb Mon Sep 17 00:00:00 2001
From: jreback
Date: Wed, 5 Jun 2013 13:29:22 -0400
Subject: [PATCH 2/7] DOC: docstring/release notes updates for py3k

DOC: v0.11.1 updates
---
 README.rst             |  1 -
 RELEASE.rst            | 15 +++++++++++++++
 doc/source/install.rst |  1 -
 doc/source/io.rst      |  9 ++++-----
 doc/source/v0.11.1.txt |  3 +++
 pandas/io/pytables.py  | 10 ++++++----
 6 files changed, 28 insertions(+), 11 deletions(-)

diff --git a/README.rst b/README.rst
index daea702476ebc..85868176722bd 100644
--- a/README.rst
+++ b/README.rst
@@ -85,7 +85,6 @@ Optional dependencies
   - `Cython `__: Only necessary to build development version. Version 0.17.1 or higher.
   - `SciPy `__: miscellaneous statistical functions
   - `PyTables `__: necessary for HDF5-based storage
-    - Not yet supported on python >= 3
   - `matplotlib `__: for plotting
   - `statsmodels `__
     - Needed for parts of :mod:`pandas.stats`

diff --git a/RELEASE.rst b/RELEASE.rst
index 12d2389a8a59b..28c4ce8becbb0 100644
--- a/RELEASE.rst
+++ b/RELEASE.rst
@@ -63,6 +63,7 @@ pandas 0.11.1
     to append an index with a different name than the existing
   - support datelike columns with a timezone as data_columns (GH2852_)
   - table writing performance improvements.
+  - support python3 (via ``PyTables 3.0.0``)
   - Add modulo operator to Series, DataFrame
   - Add ``date`` method to DatetimeIndex
   - Simplified the API and added a describe method to Categorical
@@ -79,10 +80,14 @@ pandas 0.11.1

 **API Changes**

-  - When removing an object from a ``HDFStore``, ``remove(key)`` raises
-    ``KeyError`` if the key is not a valid store object.
-  - In an ``HDFStore``, raise a ``TypeError`` on passing ``where`` or ``columns``
-    to select with a Storer; these are invalid parameters at this time
+  - ``HDFStore``
+
+    - When removing an object, ``remove(key)`` raises
+      ``KeyError`` if the key is not a valid store object.
+    - raise a ``TypeError`` on passing ``where`` or ``columns``
+      to select with a Storer; these are invalid parameters at this time
+    - can now specify an ``encoding`` option to ``append/put``
+      to enable alternate encodings
   - The repr() for (Multi)Index now obeys display.max_seq_items rather
     than numpy threshold print options. (GH3426_, GH3466_)
   - Added mangle_dupe_cols option to read_table/csv, allowing users
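The notes above drop the "not yet supported on python >= 3" caveat: HDF5-based
storage now works on py3 given a new-enough PyTables. An illustrative version
gate (a sketch, not part of this patch; the minimums are taken from the release
notes above):

    import sys
    import tables
    from distutils.version import LooseVersion

    # py3 needs PyTables >= 3.0.0; py2 keeps working from 2.3 on
    _min_tables = '3.0.0' if sys.version_info[0] >= 3 else '2.3'
    if LooseVersion(tables.__version__) < LooseVersion(_min_tables):
        raise ImportError("PyTables >= %s is required for HDFStore" % _min_tables)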
diff --git a/doc/source/install.rst b/doc/source/install.rst
index 6868969c1b968..9dc8064da45e3 100644
--- a/doc/source/install.rst
+++ b/doc/source/install.rst
@@ -95,7 +95,6 @@ Optional Dependencies
     version. Version 0.17.1 or higher.
   * `SciPy `__: miscellaneous statistical functions
   * `PyTables `__: necessary for HDF5-based storage
-    * Not yet supported on python >= 3
   * `matplotlib `__: for plotting
   * `statsmodels `__
     * Needed for parts of :mod:`pandas.stats`

diff --git a/doc/source/io.rst b/doc/source/io.rst
index 802ab08e85932..1c615ca278668 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -1300,12 +1300,11 @@ the high performance HDF5 format using the excellent `PyTables `__ library.

 See the :ref:`cookbook` for some advanced strategies

-.. warning::
+.. note::

-   ``PyTables`` 3.0.0 was recently released. This enables support for Python 3,
-   however, it has not been integrated into pandas as of yet. (Under Python 2,
-   ``PyTables`` version >= 2.3 is supported).
-
+   ``PyTables`` 3.0.0 was recently released to enable support for Python 3.
+   Pandas should be fully compatible (and previously written stores should be
+   backwards compatible) with all ``PyTables`` >= 2.3.

 .. ipython:: python
    :suppress:

diff --git a/doc/source/v0.11.1.txt b/doc/source/v0.11.1.txt
index b2fee1acbc4d6..badb364d214d1 100644
--- a/doc/source/v0.11.1.txt
+++ b/doc/source/v0.11.1.txt
@@ -237,6 +237,9 @@ Enhancements
     pd.get_option('a.b')
     pd.get_option('b.c')

+  - Support for ``HDFStore`` (via ``PyTables 3.0.0``) on Python3
+
+
 Bug Fixes
 ~~~~~~~~~

diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index 5a480e08effba..b4d312d55104f 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -522,15 +522,16 @@ def put(self, key, value, table=None, append=False, **kwargs):

         Parameters
         ----------
-        key : object
-        value : {Series, DataFrame, Panel}
-        table : boolean, default False
+        key : object
+        value : {Series, DataFrame, Panel}
+        table : boolean, default False
             Write as a PyTables Table structure which may perform worse but
             allow more flexible operations like searching / selecting subsets
             of the data
-        append : boolean, default False
+        append : boolean, default False
             For table data structures, append the input data to the existing
             table
+        encoding : default None, provide an encoding for strings
         """
         self._write_to_group(key, value, table=table, append=append, **kwargs)

@@ -595,6 +596,7 @@ def append(self, key, value, columns=None, **kwargs):
         nan_rep : string to use as string nan representation
         chunksize : size to chunk the writing
         expectedrows : expected TOTAL row size of this table
+        encoding : default None, provide an encoding for strings

         Notes
         -----

From aef951611f0552bfec14c211b7d350596d80f015 Mon Sep 17 00:00:00 2001
From: jreback
Date: Mon, 3 Jun 2013 17:41:01 -0400
Subject: [PATCH 3/7] ENH: provide py3k string decoding and compat

---
 RELEASE.rst                      |  12 ++
 pandas/io/pytables.py            | 290 +++++++++++++++++++------------
 pandas/io/tests/test_pytables.py |  15 ++
 pandas/lib.pyx                   |   9 +-
 4 files changed, 210 insertions(+), 116 deletions(-)
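The core of this patch is an optional ``encoding`` for string data in a store.
A minimal round-trip sketch (mirrors the ``test_encoding`` test added below;
the file name is a placeholder):

    from pandas import DataFrame, HDFStore

    store = HDFStore('tmp.h5')
    df = DataFrame(dict(A='foo', B='bar'), index=range(5))
    store.append('df', df, encoding='ascii')   # strings encoded to bytes on write
    result = store.select('df')                # decoded back to strings on read
    store.close()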
diff --git a/RELEASE.rst b/RELEASE.rst
index 28c4ce8becbb0..c05bb526ab715 100644
--- a/RELEASE.rst
+++ b/RELEASE.rst
@@ -64,9 +64,9 @@ pandas 0.11.1
   - support datelike columns with a timezone as data_columns (GH2852_)
   - table writing performance improvements.
-  - support python3 (via ``PyTables 3.0.0``)
+  - support py3 (via ``PyTables 3.0.0``)
   - Add modulo operator to Series, DataFrame
   - Add ``date`` method to DatetimeIndex
   - Simplified the API and added a describe method to Categorical
@@ -83,14 +83,14 @@ pandas 0.11.1

 **API Changes**

   - ``HDFStore``

     - When removing an object, ``remove(key)`` raises
       ``KeyError`` if the key is not a valid store object.
     - raise a ``TypeError`` on passing ``where`` or ``columns``
       to select with a Storer; these are invalid parameters at this time
-    - can now specify an ``encoding`` option to ``append/put``
+    - can now specify an ``encoding`` option to ``append`` and ``select``
       to enable alternate encodings
   - The repr() for (Multi)Index now obeys display.max_seq_items rather
     than numpy threshold print options. (GH3426_, GH3466_)
   - Added mangle_dupe_cols option to read_table/csv, allowing users

diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index b4d312d55104f..87590fe65b5bb 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -13,7 +13,8 @@
 import numpy as np
 from pandas import (
-    Series, TimeSeries, DataFrame, Panel, Panel4D, Index, MultiIndex, Int64Index
+    Series, TimeSeries, DataFrame, Panel, Panel4D, Index,
+    MultiIndex, Int64Index, Timestamp
 )
 from pandas.sparse.api import SparseSeries, SparseDataFrame, SparsePanel
 from pandas.sparse.array import BlockIndex, IntIndex
@@ -27,6 +28,7 @@
 from pandas.core.index import Int64Index, _ensure_index
 import pandas.core.common as com
 from pandas.tools.merge import concat
+from pandas.util import py3compat
 import pandas.lib as lib
 import pandas.algos as algos
@@ -37,6 +39,9 @@
 # versioning attribute
 _version = '0.10.1'

+# PY3 encoding if we don't specify
+_default_encoding = 'UTF-8'
+
 class IncompatibilityWarning(Warning): pass
 incompatibility_doc = """
 where criteria is being ignored as this version [%s] is too old (or not-defined),
@@ -56,40 +61,40 @@ class PerformanceWarning(Warning): pass

 # map object types
 _TYPE_MAP = {
-    Series          : 'series',
-    SparseSeries    : 'sparse_series',
-    TimeSeries      : 'series',
-    DataFrame       : 'frame',
-    SparseDataFrame : 'sparse_frame',
-    Panel           : 'wide',
-    Panel4D         : 'ndim',
-    SparsePanel     : 'sparse_panel'
+    Series          : u'series',
+    SparseSeries    : u'sparse_series',
+    TimeSeries      : u'series',
+    DataFrame       : u'frame',
+    SparseDataFrame : u'sparse_frame',
+    Panel           : u'wide',
+    Panel4D         : u'ndim',
+    SparsePanel     : u'sparse_panel'
 }

 # storer class map
 _STORER_MAP = {
-    'TimeSeries'    : 'LegacySeriesStorer',
-    'Series'        : 'LegacySeriesStorer',
-    'DataFrame'     : 'LegacyFrameStorer',
-    'DataMatrix'    : 'LegacyFrameStorer',
-    'series'        : 'SeriesStorer',
-    'sparse_series' : 'SparseSeriesStorer',
-    'frame'         : 'FrameStorer',
-    'sparse_frame'  : 'SparseFrameStorer',
-    'wide'          : 'PanelStorer',
-    'sparse_panel'  : 'SparsePanelStorer',
+    u'TimeSeries'    : 'LegacySeriesStorer',
+    u'Series'        : 'LegacySeriesStorer',
+    u'DataFrame'     : 'LegacyFrameStorer',
+    u'DataMatrix'    : 'LegacyFrameStorer',
+    u'series'        : 'SeriesStorer',
+    u'sparse_series' : 'SparseSeriesStorer',
+    u'frame'         : 'FrameStorer',
+    u'sparse_frame'  : 'SparseFrameStorer',
+    u'wide'          : 'PanelStorer',
+    u'sparse_panel'  : 'SparsePanelStorer',
 }

 # table class map
 _TABLE_MAP = {
-    'generic_table'         : 'GenericTable',
-    'appendable_frame'      : 'AppendableFrameTable',
-    'appendable_multiframe' : 'AppendableMultiFrameTable',
-    'appendable_panel'      : 'AppendablePanelTable',
-    'appendable_ndim'       : 'AppendableNDimTable',
-    'worm'                  : 'WORMTable',
-    'legacy_frame'          : 'LegacyFrameTable',
-    'legacy_panel'          : 'LegacyPanelTable',
+    u'generic_table'         : 'GenericTable',
+    u'appendable_frame'      : 'AppendableFrameTable',
+    u'appendable_multiframe' : 'AppendableMultiFrameTable',
+    u'appendable_panel'      : 'AppendablePanelTable',
+    u'appendable_ndim'       : 'AppendableNDimTable',
+    u'worm'                  : 'WORMTable',
+    u'legacy_frame'          : 'LegacyFrameTable',
+    u'legacy_panel'          : 'LegacyPanelTable',
 }

 # axes map
@@ -201,7 +206,7 @@ class HDFStore(object):
     complevel : int, 1-9, default 0
         If a complib is specified compression will be applied
         where possible
-    complib : {'zliu', 'bzip2', 'lzo', 'blosc', None}, default None
+    complib : {'zlib', 'bzip2', 'lzo', 'blosc', None}, default None
         If complevel is > 0 apply compression to objects written
         in the store wherever possible
     fletcher32 : bool, default False
@@ -694,7 +699,7 @@ def create_table_index(self, key, **kwargs):
     def groups(self):
         """ return a list of all the top-level nodes (that are not themselves a pandas storage object) """
         _tables()
-        return [ g for g in self._handle.walkNodes() if getattr(g._v_attrs,'pandas_type',None) or getattr(g,'table',None) or (isinstance(g,_table_mod.table.Table) and g._v_name != 'table') ]
+        return [ g for g in self._handle.walkNodes() if getattr(g._v_attrs,'pandas_type',None) or getattr(g,'table',None) or (isinstance(g,_table_mod.table.Table) and g._v_name != u'table') ]

     def get_node(self, key):
         """ return the node with the key or None if it does not exist """
@@ -772,8 +777,8 @@ def error(t):
             _tables()
             if getattr(group,'table',None) or isinstance(group,_table_mod.table.Table):
-                pt = 'frame_table'
-                tt = 'generic_table'
+                pt = u'frame_table'
+                tt = u'generic_table'
             else:
                 raise TypeError("cannot create a storer if the object is not existing nor a value are passed")
         else:
@@ -785,10 +790,10 @@ def error(t):

         # we are actually a table
         if table or append:
-            pt += '_table'
+            pt += u'_table'

         # a storer node
-        if 'table' not in pt:
+        if u'table' not in pt:
             try:
                 return globals()[_STORER_MAP[pt]](self, group, **kwargs)
             except:
@@ -800,26 +805,26 @@ def error(t):

         # if we are a writer, determine the tt
         if value is not None:

-            if pt == 'frame_table':
+            if pt == u'frame_table':
                 index = getattr(value,'index',None)
                 if index is not None:
                     if index.nlevels == 1:
-                        tt = 'appendable_frame'
+                        tt = u'appendable_frame'
                     elif index.nlevels > 1:
-                        tt = 'appendable_multiframe'
-            elif pt == 'wide_table':
-                tt = 'appendable_panel'
-            elif pt == 'ndim_table':
-                tt = 'appendable_ndim'
+                        tt = u'appendable_multiframe'
+            elif pt == u'wide_table':
+                tt = u'appendable_panel'
+            elif pt == u'ndim_table':
+                tt = u'appendable_ndim'

             else:

                 # distinguish between a frame/table
-                tt = 'legacy_panel'
+                tt = u'legacy_panel'
                 try:
                     fields = group.table._v_attrs.fields
-                    if len(fields) == 1 and fields[0] == 'value':
-                        tt = 'legacy_frame'
+                    if len(fields) == 1 and fields[0] == u'value':
+                        tt = u'legacy_frame'
                 except:
                     pass

@@ -892,7 +897,7 @@ class TableIterator(object):

     def __init__(self, func, nrows, start=None, stop=None, chunksize=None):
         self.func = func
-        self.nrows = nrows
+        self.nrows = nrows or 0
         self.start = start or 0

         if stop is None:
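The one-line change just above (``self.nrows = nrows or 0``) guards
TableIterator's chunk arithmetic when a storer reports no row count yet. A
sketch of the failure it avoids (hypothetical values, not part of the patch):

    nrows = None                # a not-yet-created table has no row count
    nrows = nrows or 0          # normalized: the iteration is simply empty
    chunksize = 100000
    for start in range(0, nrows, chunksize):
        pass                    # previously: ranging over None raised TypeError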
@@ -1017,7 +1022,7 @@ def infer(self, table):
         new_self.get_attr()
         return new_self

-    def convert(self, values, nan_rep):
+    def convert(self, values, nan_rep, encoding):
         """ set the values from this selection: take = take ownership """
         try:
             values = values[self.cname]
@@ -1032,13 +1037,13 @@ def convert(self, values, nan_rep):
         if self.index_name is not None:
             kwargs['name'] = self.index_name
         try:
-            self.values = Index(_maybe_convert(values, self.kind), **kwargs)
+            self.values = Index(_maybe_convert(values, self.kind, encoding), **kwargs)
         except:

             # if the output freq is different than what we recorded, then infer it
             if 'freq' in kwargs:
                 kwargs['freq'] = 'infer'
-            self.values = Index(_maybe_convert(values, self.kind), **kwargs)
+            self.values = Index(_maybe_convert(values, self.kind, encoding), **kwargs)

         return self

     def take_data(self):
@@ -1070,7 +1075,7 @@ def __iter__(self):

     def maybe_set_size(self, min_itemsize=None, **kwargs):
         """ maybe set a string col itemsize:
             min_itemsize can be an integer or a dict with this column's name with an integer size """
-        if self.kind == 'string':
+        if self.kind == u'string':

             if isinstance(min_itemsize, dict):
                 min_itemsize = min_itemsize.get(self.name)
@@ -1090,7 +1095,7 @@ def validate_col(self, itemsize=None):

         # validate this column for string truncation (or reset to the max size)
         dtype = getattr(self, 'dtype', None)
-        if self.kind == 'string':
+        if self.kind == u'string':

             c = self.col
             if c is not None:
@@ -1169,7 +1174,7 @@ class GenericIndexCol(IndexCol):
     def is_indexed(self):
         return False

-    def convert(self, values, nan_rep):
+    def convert(self, values, nan_rep, encoding):
         """ set the values from this selection: take = take ownership """
         self.values = Int64Index(np.arange(self.table.nrows))
@@ -1248,22 +1253,25 @@ def take_data(self):
     def set_kind(self):
         # set my kind if we can
         if self.dtype is not None:
-            if self.dtype.startswith('string'):
+            dtype = self.dtype
+            if dtype.startswith(u'string') or dtype.startswith(u'bytes'):
                 self.kind = 'string'
-            elif self.dtype.startswith('float'):
+            elif dtype.startswith(u'float'):
                 self.kind = 'float'
-            elif self.dtype.startswith('int'):
+            elif dtype.startswith(u'int') or dtype.startswith(u'uint'):
                 self.kind = 'integer'
-            elif self.dtype.startswith('date'):
+            elif dtype.startswith(u'date'):
                 self.kind = 'datetime'
-            elif self.dtype.startswith('bool'):
+            elif dtype.startswith(u'bool'):
                 self.kind = 'bool'
+            else:
+                raise AssertionError("cannot interpret dtype of [%s] in [%s]" % (dtype,self))

         # set my typ if we need
         if self.typ is None:
             self.typ = getattr(self.description,self.cname,None)

-    def set_atom(self, block, existing_col, min_itemsize, nan_rep, info, **kwargs):
+    def set_atom(self, block, existing_col, min_itemsize, nan_rep, info, encoding=None, **kwargs):
         """ create and setup my atom from the block b """

         self.values = list(block.items)
@@ -1306,7 +1314,7 @@ def set_atom(self, block, existing_col, min_itemsize, nan_rep, info, **kwargs):
         # this is basically a catchall; if say a datetime64 has nans then will
         # end up here ###
         elif inferred_type == 'string' or dtype == 'object':
-            self.set_atom_string(block, existing_col, min_itemsize, nan_rep)
+            self.set_atom_string(block, existing_col, min_itemsize, nan_rep, encoding)
         else:
             self.set_atom_data(block)

@@ -1315,7 +1323,7 @@ def set_atom(self, block, existing_col, min_itemsize, nan_rep, info, **kwargs):
     def get_atom_string(self, block, itemsize):
         return _tables().StringCol(itemsize=itemsize, shape=block.shape[0])

-    def set_atom_string(self, block, existing_col, min_itemsize, nan_rep):
+    def 
set_atom_string(self, block, existing_col, min_itemsize, nan_rep, encoding): # fill nan items with myself block = block.fillna(nan_rep) data = block.values @@ -1336,7 +1344,7 @@ def set_atom_string(self, block, existing_col, min_itemsize, nan_rep): # itemsize is the maximum length of a string (along any dimension) - itemsize = lib.max_len_string_array(data.ravel()) + itemsize = lib.max_len_string_array(com._ensure_object(data.ravel())) # specified min_itemsize? if isinstance(min_itemsize, dict): @@ -1353,10 +1361,10 @@ def set_atom_string(self, block, existing_col, min_itemsize, nan_rep): self.itemsize = itemsize self.kind = 'string' self.typ = self.get_atom_string(block, itemsize) - self.set_data(self.convert_string_data(data, itemsize)) + self.set_data(self.convert_string_data(data, itemsize, encoding)) - def convert_string_data(self, data, itemsize): - return data.astype('S%s' % itemsize) + def convert_string_data(self, data, itemsize, encoding): + return _convert_string_array(data, encoding, itemsize) def get_atom_coltype(self): """ return the PyTables column class for this column """ @@ -1409,7 +1417,7 @@ def validate_attr(self, append): raise ValueError("appended items dtype do not match existing items dtype" " in table!") - def convert(self, values, nan_rep): + def convert(self, values, nan_rep, encoding): """ set the data from this selection (and convert to the correct dtype if we can) """ try: values = values[self.cname] @@ -1421,7 +1429,7 @@ def convert(self, values, nan_rep): if self.dtype is not None: # reverse converts - if self.dtype == 'datetime64': + if self.dtype == u'datetime64': # recreate the timezone if self.tz is not None: @@ -1434,10 +1442,10 @@ def convert(self, values, nan_rep): else: self.data = np.asarray(self.data, dtype='M8[ns]') - elif self.dtype == 'date': + elif self.dtype == u'date': self.data = np.array( [date.fromtimestamp(v) for v in self.data], dtype=object) - elif self.dtype == 'datetime': + elif self.dtype == u'datetime': self.data = np.array( [datetime.fromtimestamp(v) for v in self.data], dtype=object) @@ -1448,16 +1456,16 @@ def convert(self, values, nan_rep): except: self.data = self.data.astype('O') - # convert nans - if self.kind == 'string': - self.data = lib.array_replace_from_nan_rep( - self.data.ravel(), nan_rep).reshape(self.data.shape) + # convert nans / decode + if self.kind == u'string': + self.data = _unconvert_string_array(self.data, nan_rep=nan_rep, encoding=encoding) + return self def get_attr(self): """ get the data for this colummn """ self.values = getattr(self.attrs, self.kind_attr, None) - self.dtype = getattr(self.attrs, self.dtype_attr, None) + self.dtype = getattr(self.attrs, self.dtype_attr, None) self.set_kind() def set_attr(self): @@ -1473,7 +1481,7 @@ class DataIndexableCol(DataCol): @property def is_searchable(self): - return self.kind == 'string' + return self.kind == u'string' def get_atom_string(self, block, itemsize): return _tables().StringCol(itemsize=itemsize) @@ -1702,7 +1710,7 @@ def read_array(self, key): else: ret = data - if dtype == 'datetime64': + if dtype == u'datetime64': ret = np.array(ret, dtype='M8[ns]') if transposed: @@ -1713,13 +1721,13 @@ def read_array(self, key): def read_index(self, key): variety = getattr(self.attrs, '%s_variety' % key) - if variety == 'multi': + if variety == u'multi': return self.read_multi_index(key) - elif variety == 'block': + elif variety == u'block': return self.read_block_index(key) - elif variety == 'sparseint': + elif variety == u'sparseint': return 
self.read_sparse_intindex(key) - elif variety == 'regular': + elif variety == u'regular': _, index = self.read_index_node(getattr(self.group, key)) return index else: # pragma: no cover @@ -1979,7 +1987,7 @@ def read(self, **kwargs): sp_values = self.read_array('sp_values') sp_index = self.read_index('sp_index') return SparseSeries(sp_values, index=index, sparse_index=sp_index, - kind=self.kind or 'block', fill_value=self.fill_value, + kind=self.kind or u'block', fill_value=self.fill_value, name=self.name) def write(self, obj, **kwargs): @@ -2167,6 +2175,7 @@ def __init__(self, *args, **kwargs): self.data_columns = [] self.info = dict() self.nan_rep = None + self.encoding = None self.selection = None @property @@ -2227,7 +2236,7 @@ def nrows_expected(self): @property def is_exists(self): """ has this table been created """ - return 'table' in self.group + return u'table' in self.group @property def storable(self): @@ -2293,6 +2302,7 @@ def set_attrs(self): self.attrs.non_index_axes = self.non_index_axes self.attrs.data_columns = self.data_columns self.attrs.nan_rep = self.nan_rep + self.attrs.encoding = self.encoding self.attrs.levels = self.levels self.set_info() @@ -2302,6 +2312,7 @@ def get_attrs(self): self.data_columns = getattr(self.attrs,'data_columns',None) or [] self.info = getattr(self.attrs,'info',None) or dict() self.nan_rep = getattr(self.attrs,'nan_rep',None) + self.encoding = getattr(self.attrs,'encoding',None) self.levels = getattr(self.attrs,'levels',None) or [] t = self.table self.index_axes = [ a.infer(t) for a in self.indexables if a.is_an_indexable ] @@ -2432,7 +2443,7 @@ def read_axes(self, where, **kwargs): # convert the data for a in self.axes: a.set_info(self.info) - a.convert(values, nan_rep=self.nan_rep) + a.convert(values, nan_rep=self.nan_rep, encoding=self.encoding) return True @@ -2464,7 +2475,7 @@ def validate_data_columns(self, data_columns, min_itemsize): # return valid columns in the order of our axis return [c for c in data_columns if c in axis_labels] - def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, min_itemsize=None, **kwargs): + def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, min_itemsize=None, encoding=None, **kwargs): """ create and return the axes leagcy tables create an indexable column, indexable index, non-indexable fields @@ -2475,6 +2486,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, validate: validate the obj against an existiing object already written min_itemsize: a dict of the min size for a column in bytes nan_rep : a values to use for string column nan_rep + encoding : the encoding for string values data_columns : a list of columns that we want to create separate to allow indexing (or True will force all colummns) """ @@ -2497,6 +2509,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, axes = [ a.axis for a in existing_table.index_axes] data_columns = existing_table.data_columns nan_rep = existing_table.nan_rep + encoding = existing_table.encoding self.info = copy.copy(existing_table.info) else: existing_table = None @@ -2509,9 +2522,16 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, self.non_index_axes = [] self.data_columns = [] + # encoding + if encoding is None: + if py3compat.PY3: + encoding = _default_encoding + self.encoding = encoding + # nan_representation if nan_rep is None: nan_rep = 'nan' + self.nan_rep = nan_rep # create axes to index and non_index @@ 
-2521,7 +2541,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, if i in axes: name = obj._AXIS_NAMES[i] index_axes_map[i] = _convert_index( - a).set_name(name).set_axis(i) + a, self.encoding).set_name(name).set_axis(i) else: # we might be able to change the axes on the appending data if @@ -2597,6 +2617,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, existing_col=existing_col, min_itemsize=min_itemsize, nan_rep=nan_rep, + encoding=encoding, info=self.info, **kwargs) col.set_pos(j) @@ -2718,7 +2739,7 @@ def read_column(self, column, where = None, **kwargs): # column must be an indexable or a data column c = getattr(self.table.cols, column) a.set_info(self.info) - return Series(a.convert(c[:], nan_rep=self.nan_rep).take_data()) + return Series(a.convert(c[:], nan_rep=self.nan_rep, encoding=self.encoding).take_data()) raise KeyError("column [%s] not found in the table" % column) @@ -2863,14 +2884,15 @@ class AppendableTable(LegacyTable): def write(self, obj, axes=None, append=False, complib=None, complevel=None, fletcher32=None, min_itemsize=None, chunksize=None, - expectedrows=None, **kwargs): + expectedrows=None, encoding=None, **kwargs): if not append and self.is_exists: self._handle.removeNode(self.group, 'table') # create the axes self.create_axes(axes=axes, obj=obj, validate=append, - min_itemsize=min_itemsize, **kwargs) + min_itemsize=min_itemsize, encoding=encoding, + **kwargs) if not self.is_exists: @@ -3173,7 +3195,7 @@ class AppendableNDimTable(AppendablePanelTable): ndim = 4 obj_type = Panel4D -def _convert_index(index): +def _convert_index(index, encoding=None): index_name = getattr(index,'name',None) if isinstance(index, DatetimeIndex): @@ -3213,7 +3235,7 @@ def _convert_index(index): # atom = _tables().ObjectAtom() # return np.asarray(values, dtype='O'), 'object', atom - converted = np.array(list(values), dtype=np.str_) + converted = _convert_string_array(values, encoding) itemsize = converted.dtype.itemsize return IndexCol(converted, 'string', _tables().StringCol(itemsize), itemsize=itemsize, index_name=index_name) @@ -3236,47 +3258,77 @@ def _convert_index(index): index_name=index_name) def _unconvert_index(data, kind): - if kind == 'datetime64': + if kind == u'datetime64': index = DatetimeIndex(data) - elif kind == 'datetime': + elif kind == u'datetime': index = np.array([datetime.fromtimestamp(v) for v in data], dtype=object) - elif kind == 'date': + elif kind == u'date': index = np.array([date.fromtimestamp(v) for v in data], dtype=object) - elif kind in ('string', 'integer', 'float'): + elif kind in (u'string', u'integer', u'float'): index = np.array(data) - elif kind == 'object': + elif kind == u'object': index = np.array(data[0]) else: # pragma: no cover raise ValueError('unrecognized index type %s' % kind) return index def _unconvert_index_legacy(data, kind, legacy=False): - if kind == 'datetime': + if kind == u'datetime': index = lib.time64_to_datetime(data) - elif kind in ('string', 'integer'): + elif kind in (u'string', u'integer'): index = np.array(data, dtype=object) else: # pragma: no cover raise ValueError('unrecognized index type %s' % kind) return index -def _maybe_convert(values, val_kind): +def _convert_string_array(data, encoding, itemsize=None): + + # encode if needed + if encoding is not None: + f = np.vectorize(lambda x: x.encode(encoding)) + data = f(data) + + # create the sized dtype + if itemsize is None: + itemsize = lib.max_len_string_array(com._ensure_object(data.ravel())) + 
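+    # note (illustrative): itemsize above is measured on the already-encoded
+    # values, so a multi-byte character such as u'\xe9' counts as 2 bytes
+    # under UTF-8 before the fixed-width 'S%d' cast that follows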
+    data = np.array(data,dtype="S%d" % itemsize)
+    return data
+
+def _unconvert_string_array(data, nan_rep=None, encoding=None):
+    """ deserialize a string array, possibly decoding """
+    shape = data.shape
+    data = np.array(data.ravel(),dtype=object)
+    if encoding is not None:
+        f = np.vectorize(lambda x: x.decode(encoding),otypes=[np.object])
+        data = f(data)
+
+    if nan_rep is None:
+        nan_rep = 'nan'
+
+    data = lib.string_array_replace_from_nan_rep(data, nan_rep)
+    return data.reshape(shape)
+
+def _maybe_convert(values, val_kind, encoding):
     if _need_convert(val_kind):
-        conv = _get_converter(val_kind)
+        conv = _get_converter(val_kind, encoding)
         # conv = np.frompyfunc(conv, 1, 1)
         values = conv(values)
     return values

-def _get_converter(kind):
+def _get_converter(kind, encoding):
     if kind == 'datetime64':
         return lambda x: np.array(x, dtype='M8[ns]')
-    if kind == 'datetime':
+    elif kind == 'datetime':
         return lib.convert_timestamps
+    elif kind == 'string':
+        return lambda x: _unconvert_string_array(x,encoding=encoding)
     else: # pragma: no cover
         raise ValueError('invalid kind %s' % kind)

 def _need_convert(kind):
-    if kind in ('datetime', 'datetime64'):
+    if kind in ('datetime', 'datetime64', 'string'):
         return True
     return False

@@ -3290,6 +3342,7 @@ class Term(object):
         >, >=, <, <=, =, != (not equal) are allowed
     value : a value or list of values (required)
     queryables : a kinds map (dict of column name -> kind), or None if column is non-indexable
+    encoding : an encoding that will encode the query terms

     Returns
     -------
@@ -3303,14 +3356,14 @@ class Term(object):
     >>> Term('index', ['20121114','20121114'])
     >>> Term('index', datetime(2012,11,14))
     >>> Term('major_axis>20121114')
     >>> Term('minor_axis', ['A','B'])
     """

     _ops = ['<=', '<', '>=', '>', '!=', '==', '=']
     _search = re.compile("^\s*(?P\w+)\s*(?P%s)\s*(?P.+)\s*$" % '|'.join(_ops))
     _max_selectors = 31

-    def __init__(self, field, op=None, value=None, queryables=None):
+    def __init__(self, field, op=None, value=None, queryables=None, encoding=None):
         self.field = None
         self.op = None
         self.value = None
@@ -3318,6 +3371,12 @@ def __init__(self, field, op=None, value=None, queryables=None):
         self.filter = None
         self.condition = None

+        if py3compat.PY3:
+            if encoding is None:
+                encoding = _default_encoding
+
+        self.encoding = encoding
+
         # unpack lists/tuples in field
         while(isinstance(field, (tuple, list))):
             f = field
@@ -3446,7 +3505,7 @@ def eval(self):
                 self.condition = '(%s %s %s)' % (
                     self.field, self.op, values[0][0])
-
+
             else:

                 raise TypeError("passing a filterable condition to a non-table indexer [%s]" % str(self))
@@ -3454,32 +3513,39 @@ def eval(self):
     def convert_value(self, v):
         """ convert the expression that is in the term to something that is accepted by pytables """

+        def stringify(value):
+            value = str(value)
+            if self.encoding is not None:
+                value = value.encode(self.encoding)
+            return value
+
-        if self.kind == 'datetime64' or self.kind == 'datetime' :
+        kind = self.kind
+        if kind == u'datetime64' or kind == u'datetime' :
             v = lib.Timestamp(v)
             if v.tz is not None:
                 v = v.tz_convert('UTC')
             return [v.value, v]
-        elif isinstance(v, datetime) or hasattr(v, 'timetuple') or self.kind == 'date':
+        elif isinstance(v, datetime) or hasattr(v, 'timetuple') or kind == u'date':
             v = time.mktime(v.timetuple())
             return [v, Timestamp(v) ]
-        elif self.kind == 'integer':
+        elif kind == u'integer':
             v = int(float(v))
             return [v, v]
-        elif self.kind == 'float':
+        elif kind == u'float':
             v = float(v)
             return [v, v]
-        elif self.kind == 
'bool': + elif kind == u'bool': if isinstance(v, basestring): - v = not str(v).strip().lower() in ["false", "f", "no", "n", "none", "0", "[]", "{}", ""] + v = not stringify(v).strip().lower() in [u'false', u'f', u'no', u'n', u'none', u'0', u'[]', u'{}', u''] else: v = bool(v) return [v, v] elif not isinstance(v, basestring): - v = str(v) + v = stringify(v) return [v, v] # string quoting - return ["'" + v + "'", v] + return [stringify("'" + v + "'"), stringify(v)] class Coordinates(object): @@ -3533,6 +3599,8 @@ def __init__(self, table, where=None, start=None, stop=None, **kwargs): conds = [t.condition for t in self.terms if t.condition is not None] if len(conds): self.condition = "(%s)" % ' & '.join(conds) + #if self.table.encoding is not None: + # self.condition = self.condition.encode(self.table.encoding) self.filter = [] for t in self.terms: if t.filter is not None: @@ -3555,7 +3623,7 @@ def generate(self, where): where = [where] queryables = self.table.queryables() - return [Term(c, queryables=queryables) for c in where] + return [Term(c, queryables=queryables, encoding=self.table.encoding) for c in where] def select(self): """ diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index d0f03774f2070..7a7bca02b1cd2 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -474,6 +474,20 @@ def test_append(self): store.append('uints', uint_data, data_columns=['u08','u16','u32']) # 64-bit indices not yet supported tm.assert_frame_equal(store['uints'], uint_data) + def test_encoding(self): + + with ensure_clean(self.path) as store: + df = DataFrame(dict(A='foo',B='bar'),index=range(5)) + df.loc[2,'A'] = np.nan + df.loc[3,'B'] = np.nan + _maybe_remove(store, 'df') + store.append('df', df, encoding='ascii') + tm.assert_frame_equal(store['df'], df) + + expected = df.reindex(columns=['A']) + result = store.select('df',Term('columns=A',encoding='ascii')) + tm.assert_frame_equal(result,expected) + def test_append_some_nans(self): with ensure_clean(self.path) as store: @@ -556,6 +570,7 @@ def test_append_some_nans(self): def test_append_frame_column_oriented(self): with ensure_clean(self.path) as store: + import pdb; pdb.set_trace() # column oriented df = tm.makeTimeDataFrame() _maybe_remove(store, 'df1') diff --git a/pandas/lib.pyx b/pandas/lib.pyx index 15791a984ecc5..a80ad5b7d0208 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -14,6 +14,7 @@ from cpython cimport (PyDict_New, PyDict_GetItem, PyDict_SetItem, Py_INCREF, PyTuple_SET_ITEM, PyList_Check, PyFloat_Check, PyString_Check, + PyBytes_Check, PyTuple_SetItem, PyTuple_New, PyObject_SetAttrString) @@ -762,7 +763,7 @@ def max_len_string_array(ndarray[object, ndim=1] arr): m = 0 for i from 0 <= i < length: v = arr[i] - if PyString_Check(v): + if PyString_Check(v) or PyBytes_Check(v): l = len(v) if l > m: @@ -772,11 +773,10 @@ def max_len_string_array(ndarray[object, ndim=1] arr): @cython.boundscheck(False) @cython.wraparound(False) -def array_replace_from_nan_rep(ndarray[object, ndim=1] arr, object nan_rep, object replace = None): +def string_array_replace_from_nan_rep(ndarray[object, ndim=1] arr, object nan_rep, object replace = None): """ replace the values in the array with replacement if they are nan_rep; return the same array """ - cdef int length = arr.shape[0] - cdef int i = 0 + cdef int length = arr.shape[0], i = 0 if replace is None: replace = np.nan @@ -788,7 +788,6 @@ def array_replace_from_nan_rep(ndarray[object, ndim=1] arr, object nan_rep, obje 
@cython.boundscheck(False) @cython.wraparound(False) - def write_csv_rows(list data, list data_index, int nlevels, list cols, object writer): cdef int N, j, i, ncols From 8bbfb2e3e18f59860d845c48d6fdf29d22f42877 Mon Sep 17 00:00:00 2001 From: jreback Date: Tue, 4 Jun 2013 22:04:28 -0400 Subject: [PATCH 4/7] ENH: added TermValue, and do readWhere with condvars --- pandas/io/pytables.py | 248 ++++++++++++++++++------------- pandas/io/tests/test_pytables.py | 6 +- 2 files changed, 152 insertions(+), 102 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 87590fe65b5bb..b36985d65569a 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -19,7 +19,7 @@ from pandas.sparse.api import SparseSeries, SparseDataFrame, SparsePanel from pandas.sparse.array import BlockIndex, IntIndex from pandas.tseries.api import PeriodIndex, DatetimeIndex -from pandas.core.common import adjoin, isnull +from pandas.core.common import adjoin, isnull, is_list_like from pandas.core.algorithms import match, unique, factorize from pandas.core.categorical import Categorical from pandas.core.common import _asarray_tuplesafe, _try_sort @@ -42,6 +42,18 @@ # PY3 encoding if we don't specify _default_encoding = 'UTF-8' +def _ensure_decoded(s): + """ if we have bytes, decode them to unicde """ + if isinstance(s, np.bytes_): + s = s.decode('UTF-8') + return s +def _ensure_encoding(encoding): + # set the encoding if we need + if encoding is None: + if py3compat.PY3: + encoding = _default_encoding + return encoding + class IncompatibilityWarning(Warning): pass incompatibility_doc = """ where criteria is being ignored as this version [%s] is too old (or not-defined), @@ -768,8 +780,8 @@ def error(t): raise TypeError("cannot properly create the storer for: [%s] [group->%s,value->%s,table->%s,append->%s,kwargs->%s]" % (t,group,type(value),table,append,kwargs)) - pt = getattr(group._v_attrs,'pandas_type',None) - tt = getattr(group._v_attrs,'table_type',None) + pt = _ensure_decoded(getattr(group._v_attrs,'pandas_type',None)) + tt = _ensure_decoded(getattr(group._v_attrs,'table_type',None)) # infer the pt from the passed value if pt is None: @@ -833,7 +845,7 @@ def error(t): except: error('_TABLE_MAP') - def _write_to_group(self, key, value, index=True, table=False, append=False, complib=None, **kwargs): + def _write_to_group(self, key, value, index=True, table=False, append=False, complib=None, encoding=None, **kwargs): group = self.get_node(key) # remove the node if we are not appending @@ -858,7 +870,7 @@ def _write_to_group(self, key, value, index=True, table=False, append=False, com group = self._handle.createGroup(path, p) path = new_path - s = self._create_storer(group, value, table=table, append=append, **kwargs) + s = self._create_storer(group, value, table=table, append=append, encoding=encoding, **kwargs) if append: # raise if we are trying to append to a non-table, # or a table that exists (and we are putting) @@ -1075,7 +1087,7 @@ def __iter__(self): def maybe_set_size(self, min_itemsize=None, **kwargs): """ maybe set a string col itemsize: min_itemsize can be an interger or a dict with this columns name with an integer size """ - if self.kind == u'string': + if _ensure_decoded(self.kind) == u'string': if isinstance(min_itemsize, dict): min_itemsize = min_itemsize.get(self.name) @@ -1095,7 +1107,7 @@ def validate_col(self, itemsize=None): # validate this column for string truncation (or reset to the max size) dtype = getattr(self, 'dtype', None) - if self.kind == u'string': + if 
_ensure_decoded(self.kind) == u'string': c = self.col if c is not None: @@ -1225,7 +1237,7 @@ def __init__(self, values=None, kind=None, typ=None, cname=None, data=None, bloc super(DataCol, self).__init__( values=values, kind=kind, typ=typ, cname=cname, **kwargs) self.dtype = None - self.dtype_attr = "%s_dtype" % self.name + self.dtype_attr = u"%s_dtype" % self.name self.set_data(data) def __repr__(self): @@ -1253,7 +1265,7 @@ def take_data(self): def set_kind(self): # set my kind if we can if self.dtype is not None: - dtype = self.dtype + dtype = _ensure_decoded(self.dtype) if dtype.startswith(u'string') or dtype.startswith(u'bytes'): self.kind = 'string' elif dtype.startswith(u'float'): @@ -1427,9 +1439,10 @@ def convert(self, values, nan_rep, encoding): # convert to the correct dtype if self.dtype is not None: + dtype = _ensure_decoded(self.dtype) # reverse converts - if self.dtype == u'datetime64': + if dtype == u'datetime64': # recreate the timezone if self.tz is not None: @@ -1442,22 +1455,22 @@ def convert(self, values, nan_rep, encoding): else: self.data = np.asarray(self.data, dtype='M8[ns]') - elif self.dtype == u'date': + elif dtype == u'date': self.data = np.array( [date.fromtimestamp(v) for v in self.data], dtype=object) - elif self.dtype == u'datetime': + elif dtype == u'datetime': self.data = np.array( [datetime.fromtimestamp(v) for v in self.data], dtype=object) else: try: - self.data = self.data.astype(self.dtype) + self.data = self.data.astype(dtype) except: self.data = self.data.astype('O') # convert nans / decode - if self.kind == u'string': + if _ensure_decoded(self.kind) == u'string': self.data = _unconvert_string_array(self.data, nan_rep=nan_rep, encoding=encoding) return self @@ -1481,7 +1494,7 @@ class DataIndexableCol(DataCol): @property def is_searchable(self): - return self.kind == u'string' + return _ensure_decoded(self.kind) == u'string' def get_atom_string(self, block, itemsize): return _tables().StringCol(itemsize=itemsize) @@ -1514,9 +1527,10 @@ class Storer(object): ndim = None is_table = False - def __init__(self, parent, group, **kwargs): + def __init__(self, parent, group, encoding=None, **kwargs): self.parent = parent self.group = group + self.encoding = _ensure_encoding(encoding) self.set_version() @property @@ -1525,7 +1539,7 @@ def is_old_version(self): def set_version(self): """ compute and set our version """ - version = getattr(self.group._v_attrs,'pandas_version',None) + version = _ensure_decoded(getattr(self.group._v_attrs,'pandas_version',None)) try: self.version = tuple([int(x) for x in version.split('.')]) if len(self.version) == 2: @@ -1535,7 +1549,7 @@ def set_version(self): @property def pandas_type(self): - return getattr(self.group._v_attrs, 'pandas_type', None) + return _ensure_decoded(getattr(self.group._v_attrs, 'pandas_type', None)) def __repr__(self): """ return a pretty representatgion of myself """ @@ -1684,11 +1698,19 @@ def validate_read(self, kwargs): def is_exists(self): return True + def set_attrs(self): + """ set our object attributes """ + self.attrs.encoding = self.encoding + def get_attrs(self): """ retrieve our attributes """ + self.encoding = _ensure_encoding(getattr(self.attrs,'encoding',None)) for n in self.attributes: setattr(self,n,getattr(self.attrs, n, None)) + def write(self, obj, **kwargs): + self.set_attrs() + def read_array(self, key): """ read an array for the specified node (off of group """ import tables @@ -1719,7 +1741,7 @@ def read_array(self, key): return ret def read_index(self, key): - variety = 
getattr(self.attrs, '%s_variety' % key) + variety = _ensure_decoded(getattr(self.attrs, '%s_variety' % key)) if variety == u'multi': return self.read_multi_index(key) @@ -1745,7 +1767,7 @@ def write_index(self, key, index): self.write_sparse_intindex(key, index) else: setattr(self.attrs, '%s_variety' % key, 'regular') - converted = _convert_index(index).set_name('index') + converted = _convert_index(index,self.encoding).set_name('index') self.write_array(key, converted.values) node = getattr(self.group, key) node._v_attrs.kind = converted.kind @@ -1792,7 +1814,7 @@ def write_multi_index(self, key, index): index.names)): # write the level level_key = '%s_level%d' % (key, i) - conv_level = _convert_index(lev).set_name(level_key) + conv_level = _convert_index(lev, self.encoding).set_name(level_key) self.write_array(level_key, conv_level.values) node = getattr(self.group, level_key) node._v_attrs.kind = conv_level.kind @@ -1843,16 +1865,15 @@ def read_index_node(self, node): kwargs['tz'] = node._v_attrs['tz'] if kind in ('date', 'datetime'): - index = factory(_unconvert_index(data, kind), dtype=object, + index = factory(_unconvert_index(data, kind, encoding=self.encoding), dtype=object, **kwargs) else: - index = factory(_unconvert_index(data, kind), **kwargs) + index = factory(_unconvert_index(data, kind, encoding=self.encoding), **kwargs) index.name = name return name, index - def write_array_empty(self, key, value): """ write a 0-len array """ @@ -1932,7 +1953,7 @@ def read_index_legacy(self, key): node = getattr(self.group,key) data = node[:] kind = node._v_attrs.kind - return _unconvert_index_legacy(data, kind) + return _unconvert_index_legacy(data, kind, encoding=self.encoding) class LegacySeriesStorer(LegacyStorer): @@ -1952,7 +1973,7 @@ def read(self, **kwargs): return DataFrame(values, index=index, columns=columns) class SeriesStorer(GenericStorer): - pandas_kind = 'series' + pandas_kind = u'series' attributes = ['name'] @property @@ -1973,12 +1994,13 @@ def read(self, **kwargs): return Series(values, index=index, name=self.name) def write(self, obj, **kwargs): + super(SeriesStorer, self).write(obj, **kwargs) self.write_index('index', obj.index) self.write_array('values', obj.values) self.attrs.name = obj.name class SparseSeriesStorer(GenericStorer): - pandas_kind = 'sparse_series' + pandas_kind = u'sparse_series' attributes = ['name','fill_value','kind'] def read(self, **kwargs): @@ -1991,6 +2013,7 @@ def read(self, **kwargs): name=self.name) def write(self, obj, **kwargs): + super(SparseSeriesStorer, self).write(obj, **kwargs) self.write_index('index', obj.index) self.write_index('sp_index', obj.sp_index) self.write_array('sp_values', obj.sp_values) @@ -1999,7 +2022,7 @@ def write(self, obj, **kwargs): self.attrs.kind = obj.kind class SparseFrameStorer(GenericStorer): - pandas_kind = 'sparse_frame' + pandas_kind = u'sparse_frame' attributes = ['default_kind','default_fill_value'] def read(self, **kwargs): @@ -2017,6 +2040,7 @@ def read(self, **kwargs): def write(self, obj, **kwargs): """ write it as a collection of individual sparse series """ + super(SparseFrameStorer, self).write(obj, **kwargs) for name, ss in obj.iteritems(): key = 'sparse_series_%s' % name if key not in self.group._v_children: @@ -2030,7 +2054,7 @@ def write(self, obj, **kwargs): self.write_index('columns', obj.columns) class SparsePanelStorer(GenericStorer): - pandas_kind = 'sparse_panel' + pandas_kind = u'sparse_panel' attributes = ['default_kind','default_fill_value'] def read(self, **kwargs): @@ -2048,6 
+2072,7 @@ def read(self, **kwargs): default_fill_value=self.default_fill_value) def write(self, obj, **kwargs): + super(SparsePanelStorer, self).write(obj, **kwargs) self.attrs.default_fill_value = obj.default_fill_value self.attrs.default_kind = obj.default_kind self.write_index('items', obj.items) @@ -2115,6 +2140,7 @@ def read(self, **kwargs): return self.obj_type(BlockManager(blocks, axes)) def write(self, obj, **kwargs): + super(BlockManagerStorer, self).write(obj, **kwargs) data = obj._data if not data.is_consolidated(): data = data.consolidate() @@ -2132,11 +2158,11 @@ def write(self, obj, **kwargs): self.write_index('block%d_items' % i, blk.items) class FrameStorer(BlockManagerStorer): - pandas_kind = 'frame' + pandas_kind = u'frame' obj_type = DataFrame class PanelStorer(BlockManagerStorer): - pandas_kind = 'wide' + pandas_kind = u'wide' obj_type = Panel is_shape_reversed = True @@ -2161,7 +2187,7 @@ class Table(Storer): levels : the names of levels """ - pandas_kind = 'wide_table' + pandas_kind = u'wide_table' table_type = None levels = 1 is_table = True @@ -2175,7 +2201,6 @@ def __init__(self, *args, **kwargs): self.data_columns = [] self.info = dict() self.nan_rep = None - self.encoding = None self.selection = None @property @@ -2312,7 +2337,7 @@ def get_attrs(self): self.data_columns = getattr(self.attrs,'data_columns',None) or [] self.info = getattr(self.attrs,'info',None) or dict() self.nan_rep = getattr(self.attrs,'nan_rep',None) - self.encoding = getattr(self.attrs,'encoding',None) + self.encoding = _ensure_encoding(getattr(self.attrs,'encoding',None)) self.levels = getattr(self.attrs,'levels',None) or [] t = self.table self.index_axes = [ a.infer(t) for a in self.indexables if a.is_an_indexable ] @@ -2475,7 +2500,7 @@ def validate_data_columns(self, data_columns, min_itemsize): # return valid columns in the order of our axis return [c for c in data_columns if c in axis_labels] - def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, min_itemsize=None, encoding=None, **kwargs): + def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, min_itemsize=None, **kwargs): """ create and return the axes leagcy tables create an indexable column, indexable index, non-indexable fields @@ -2506,11 +2531,11 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, if self.infer_axes(): existing_table = self.copy() existing_table.infer_axes() - axes = [ a.axis for a in existing_table.index_axes] - data_columns = existing_table.data_columns - nan_rep = existing_table.nan_rep - encoding = existing_table.encoding - self.info = copy.copy(existing_table.info) + axes = [ a.axis for a in existing_table.index_axes] + data_columns = existing_table.data_columns + nan_rep = existing_table.nan_rep + self.encoding = existing_table.encoding + self.info = copy.copy(existing_table.info) else: existing_table = None @@ -2522,12 +2547,6 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, self.non_index_axes = [] self.data_columns = [] - # encoding - if encoding is None: - if py3compat.PY3: - encoding = _default_encoding - self.encoding = encoding - # nan_representation if nan_rep is None: nan_rep = 'nan' @@ -2617,7 +2636,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, existing_col=existing_col, min_itemsize=min_itemsize, nan_rep=nan_rep, - encoding=encoding, + encoding=self.encoding, info=self.info, **kwargs) col.set_pos(j) @@ -2748,7 +2767,7 @@ class 
WORMTable(Table): table. writing is a one-time operation the data are stored in a format that allows for searching the data on disk """ - table_type = 'worm' + table_type = u'worm' def read(self, **kwargs): """ read the indicies and the indexing array, calculate offset rows and @@ -2773,7 +2792,7 @@ class LegacyTable(Table): IndexCol(name='column', axis=2, pos=1, index_kind='columns_kind'), DataCol(name='fields', cname='values', kind_attr='fields', pos=2)] - table_type = 'legacy' + table_type = u'legacy' ndim = 3 def write(self, **kwargs): @@ -2863,8 +2882,8 @@ def read(self, where=None, columns=None, **kwargs): class LegacyFrameTable(LegacyTable): """ support the legacy frame table """ - pandas_kind = 'frame_table' - table_type = 'legacy_frame' + pandas_kind = u'frame_table' + table_type = u'legacy_frame' obj_type = Panel def read(self, *args, **kwargs): @@ -2873,25 +2892,25 @@ def read(self, *args, **kwargs): class LegacyPanelTable(LegacyTable): """ support the legacy panel table """ - table_type = 'legacy_panel' + table_type = u'legacy_panel' obj_type = Panel class AppendableTable(LegacyTable): """ suppor the new appendable table formats """ _indexables = None - table_type = 'appendable' + table_type = u'appendable' def write(self, obj, axes=None, append=False, complib=None, complevel=None, fletcher32=None, min_itemsize=None, chunksize=None, - expectedrows=None, encoding=None, **kwargs): + expectedrows=None, **kwargs): if not append and self.is_exists: self._handle.removeNode(self.group, 'table') # create the axes self.create_axes(axes=axes, obj=obj, validate=append, - min_itemsize=min_itemsize, encoding=encoding, + min_itemsize=min_itemsize, **kwargs) if not self.is_exists: @@ -3043,8 +3062,8 @@ def delete(self, where=None, **kwargs): class AppendableFrameTable(AppendableTable): """ suppor the new appendable table formats """ - pandas_kind = 'frame_table' - table_type = 'appendable_frame' + pandas_kind = u'frame_table' + table_type = u'appendable_frame' ndim = 2 obj_type = DataFrame @@ -3098,8 +3117,8 @@ def read(self, where=None, columns=None, **kwargs): class GenericTable(AppendableFrameTable): """ a table that read/writes the generic pytables table format """ - pandas_kind = 'frame_table' - table_type = 'generic_table' + pandas_kind = u'frame_table' + table_type = u'generic_table' ndim = 2 obj_type = DataFrame @@ -3143,13 +3162,13 @@ def write(self, **kwargs): class AppendableMultiFrameTable(AppendableFrameTable): """ a frame with a multi-index """ - table_type = 'appendable_multiframe' + table_type = u'appendable_multiframe' obj_type = DataFrame ndim = 2 @property def table_type_short(self): - return 'appendable_multi' + return u'appendable_multi' def write(self, obj, data_columns=None, **kwargs): if data_columns is None: @@ -3174,7 +3193,7 @@ def read(self, columns=None, **kwargs): class AppendablePanelTable(AppendableTable): """ suppor the new appendable table formats """ - table_type = 'appendable_panel' + table_type = u'appendable_panel' ndim = 3 obj_type = Panel @@ -3191,7 +3210,7 @@ def is_transposed(self): class AppendableNDimTable(AppendablePanelTable): """ suppor the new appendable table formats """ - table_type = 'appendable_ndim' + table_type = u'appendable_ndim' ndim = 4 obj_type = Panel4D @@ -3257,7 +3276,8 @@ def _convert_index(index, encoding=None): return IndexCol(np.asarray(values, dtype='O'), 'object', atom, index_name=index_name) -def _unconvert_index(data, kind): +def _unconvert_index(data, kind, encoding=None): + kind = _ensure_decoded(kind) if kind == 
u'datetime64': index = DatetimeIndex(data) elif kind == u'datetime': @@ -3265,19 +3285,24 @@ def _unconvert_index(data, kind): dtype=object) elif kind == u'date': index = np.array([date.fromtimestamp(v) for v in data], dtype=object) - elif kind in (u'string', u'integer', u'float'): + elif kind in (u'integer', u'float'): index = np.array(data) + elif kind in (u'string'): + index = _unconvert_string_array(data, nan_rep=None, encoding=encoding) elif kind == u'object': index = np.array(data[0]) else: # pragma: no cover raise ValueError('unrecognized index type %s' % kind) return index -def _unconvert_index_legacy(data, kind, legacy=False): +def _unconvert_index_legacy(data, kind, legacy=False, encoding=None): + kind = _ensure_decoded(kind) if kind == u'datetime': index = lib.time64_to_datetime(data) - elif kind in (u'string', u'integer'): + elif kind in (u'integer'): index = np.array(data, dtype=object) + elif kind in (u'string'): + index = _unconvert_string_array(data, nan_rep=None, encoding=encoding) else: # pragma: no cover raise ValueError('unrecognized index type %s' % kind) return index @@ -3300,6 +3325,10 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None): """ deserialize a string array, possibly decoding """ shape = data.shape data = np.array(data.ravel(),dtype=object) + + # guard against a None encoding in PY3 (because of a legacy + # where the passed encoding is actually None) + encoding = _ensure_encoding(encoding) if encoding is not None: f = np.vectorize(lambda x: x.decode(encoding),otypes=[np.object]) data = f(data) @@ -3318,6 +3347,7 @@ def _maybe_convert(values, val_kind, encoding): return values def _get_converter(kind, encoding): + kind = _ensure_decoded(kind) if kind == 'datetime64': return lambda x: np.array(x, dtype='M8[ns]') elif kind == 'datetime': @@ -3343,7 +3373,8 @@ class Term(object): value : a value or list of values (required) queryables : a kinds map (dict of column name -> kind), or None i column is non-indexable encoding : an encoding that will encode the query terms - + i : my term id number + Returns ------- a Term object @@ -3363,18 +3394,18 @@ class Term(object): _search = re.compile("^\s*(?P\w+)\s*(?P%s)\s*(?P.+)\s*$" % '|'.join(_ops)) _max_selectors = 31 - def __init__(self, field, op=None, value=None, queryables=None, encoding=None): + def __init__(self, field, op=None, value=None, queryables=None, i=None, encoding=None): self.field = None self.op = None self.value = None self.q = queryables or dict() self.filter = None - self.condition = None - - if py3compat.PY3: - if encoding is None: - encoding = _default_encoding + if i is None: + i = 0 + self.i = i + self.condition = None + self.condvars = dict() self.encoding = encoding # unpack lists/tuples in field @@ -3427,7 +3458,7 @@ def __init__(self, field, op=None, value=None, queryables=None, encoding=None): if self.field is None or self.op is None or self.value is None: raise ValueError("Could not create this term [%s]" % str(self)) - # = vs == + # = vs == if self.op == '=': self.op = '==' @@ -3436,7 +3467,7 @@ def __init__(self, field, op=None, value=None, queryables=None, encoding=None): if hasattr(self.value, '__iter__') and len(self.value) > 1: raise ValueError("an inequality condition cannot have multiple values [%s]" % str(self)) - if not hasattr(self.value, '__iter__'): + if not is_list_like(self.value): self.value = [self.value] if len(self.q): @@ -3462,6 +3493,16 @@ def kind(self): """ the kind of my field """ return self.q.get(self.field) + def generate(self, v, i=None): + """ 
create and return the op string for this TermValue + add the variable to condvars """ + if i is None: + i = 0 + + cv = "_%s_%s_%s" % (self.field,self.i,i) + self.condvars[cv] = v.converted + return "(%s %s %s)" % (self.field, self.op, cv) + def eval(self): """ set the numexpr expression for this term """ @@ -3472,39 +3513,38 @@ def eval(self): if self.is_in_table: values = [self.convert_value(v) for v in self.value] else: - values = [[v, v] for v in self.value] + values = [TermValue(v,v,self.kind) for v in self.value] # equality conditions if self.op in ['==', '!=']: # our filter op expression if self.op == '!=': - filter_op = lambda axis, values: not axis.isin(values) + filter_op = lambda axis, vals: not axis.isin(vals) else: - filter_op = lambda axis, values: axis.isin(values) + filter_op = lambda axis, vals: axis.isin(vals) if self.is_in_table: # too many values to create the expression? if len(values) <= self._max_selectors: - self.condition = "(%s)" % ' | '.join( - ["(%s %s %s)" % (self.field, self.op, v[0]) for v in values]) + vs = [ self.generate(v, i) for i, v in enumerate(values) ] + self.condition = "(%s)" % ' | '.join(vs) # use a filter after reading else: - self.filter = (self.field, filter_op, Index([v[1] for v in values])) + self.filter = (self.field, filter_op, Index([v.value for v in values])) else: - self.filter = (self.field, filter_op, Index([v[1] for v in values])) + self.filter = (self.field, filter_op, Index([v.value for v in values])) else: if self.is_in_table: - self.condition = '(%s %s %s)' % ( - self.field, self.op, values[0][0]) + self.condition = self.generate(values[0]) else: @@ -3524,29 +3564,36 @@ def stringify(value): v = lib.Timestamp(v) if v.tz is not None: v = v.tz_convert('UTC') - return [v.value, v] + return TermValue(v,v.value,kind) elif isinstance(v, datetime) or hasattr(v, 'timetuple') or kind == u'date': v = time.mktime(v.timetuple()) - return [v, Timestamp(v) ] + return TermValue(v,Timestamp(v),kind) elif kind == u'integer': v = int(float(v)) - return [v, v] + return TermValue(v,v,kind) elif kind == u'float': v = float(v) - return [v, v] + return TermValue(v,v,kind) elif kind == u'bool': if isinstance(v, basestring): v = not stringify(v).strip().lower() in [u'false', u'f', u'no', u'n', u'none', u'0', u'[]', u'{}', u''] else: v = bool(v) - return [v, v] + return TermValue(v,v,kind) elif not isinstance(v, basestring): v = stringify(v) - return [v, v] + return TermValue(v,stringify(v),u'string') # string quoting - return [stringify("'" + v + "'"), stringify(v)] + return TermValue(v,stringify(v),u'string') +class TermValue(object): + """ hold a term value the we use to construct a condition/filter """ + + def __init__(self, value, converted, kind): + self.value = value + self.converted = converted + self.kind = kind class Coordinates(object): """ holds a returned coordinates list, useful to select the same rows from different tables @@ -3585,6 +3632,7 @@ def __init__(self, table, where=None, start=None, stop=None, **kwargs): self.start = start self.stop = stop self.condition = None + self.condvars = dict() self.filter = None self.terms = None self.coordinates = None @@ -3596,11 +3644,11 @@ def __init__(self, table, where=None, start=None, stop=None, **kwargs): # create the numexpr & the filter if self.terms: - conds = [t.condition for t in self.terms if t.condition is not None] - if len(conds): - self.condition = "(%s)" % ' & '.join(conds) - #if self.table.encoding is not None: - # self.condition = self.condition.encode(self.table.encoding) + terms = [ 
@@ -3623,14 +3671,14 @@ def generate(self, where):
             where = [where]

         queryables = self.table.queryables()
-        return [Term(c, queryables=queryables, encoding=self.table.encoding) for c in where]
+        return [Term(c, queryables=queryables, i=i, encoding=self.table.encoding) for i, c in enumerate(where)]

     def select(self):
         """ generate the selection """
         if self.condition is not None:
-            return self.table.table.readWhere(self.condition, start=self.start, stop=self.stop)
+            return self.table.table.readWhere(self.condition, condvars=self.condvars, start=self.start, stop=self.stop)
         elif self.coordinates is not None:
             return self.table.table.readCoordinates(self.coordinates)
         return self.table.table.read(start=self.start, stop=self.stop)

@@ -3642,7 +3690,7 @@ def select_coords(self):
         if self.condition is None:
             return np.arange(self.table.nrows)

-        return self.table.table.getWhereList(self.condition, start=self.start, stop=self.stop, sort=True)
+        return self.table.table.getWhereList(self.condition, condvars=self.condvars, start=self.start, stop=self.stop, sort=True)

 ### utilities ###

diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py
index 7a7bca02b1cd2..edaf905ce7b75 100644
--- a/pandas/io/tests/test_pytables.py
+++ b/pandas/io/tests/test_pytables.py
@@ -115,7 +115,7 @@ def roundtrip(key, obj,**kwargs):

             o = tm.makeTimeSeries()
             assert_series_equal(o, roundtrip('series',o))
-
+
             o = tm.makeStringSeries()
             assert_series_equal(o, roundtrip('string_series',o))

@@ -570,7 +570,7 @@ def test_append_some_nans(self):

     def test_append_frame_column_oriented(self):
         with ensure_clean(self.path) as store:
-            import pdb; pdb.set_trace()
+
             # column oriented
             df = tm.makeTimeDataFrame()
             _maybe_remove(store, 'df1')

@@ -2560,6 +2560,7 @@ def test_legacy_0_10_read(self):
         # legacy from 0.10
         try:
             store = HDFStore(tm.get_data_path('legacy_hdf/legacy_0.10.h5'), 'r')
+            str(store)
             for k in store.keys():
                 store.select(k)
         finally:

@@ -2569,6 +2570,7 @@ def test_legacy_0_11_read(self):
         # legacy from 0.11
         try:
             store = HDFStore(tm.get_data_path('legacy_hdf/legacy_table_0.11.h5'), 'r')
+            str(store)
             df = store.select('df')
             df1 = store.select('df1')
             mi = store.select('mi')
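The next patch leans on a decode-once helper for values read back under py3; a sketch of the assumed behavior (modeled on the _ensure_decoded calls in the hunks below, not a verbatim copy of the pandas implementation)::

    import numpy as np

    def _ensure_decoded(s, encoding='UTF-8'):
        # attribute values written under py2 come back as bytes under py3;
        # decode once so downstream comparisons can use unicode literals
        if isinstance(s, (np.bytes_, bytes)):
            s = s.decode(encoding)
        return s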
From cd7b115d53df8cb43e2cc76b62e58cd101114344 Mon Sep 17 00:00:00 2001
From: jreback
Date: Wed, 5 Jun 2013 13:23:14 -0400
Subject: [PATCH 5/7] TST: fixed do_copy testing
 BUG: more encoding/decoding issues

---
 pandas/io/pytables.py            | 33 ++++++++++++++++----------------
 pandas/io/tests/test_pytables.py | 24 +++++++++++++++--------
 2 files changed, 33 insertions(+), 24 deletions(-)

diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index b36985d65569a..86edb7a43ec47 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -765,9 +765,9 @@ def copy(self, file, mode = 'w', propindexes = True, keys = None, complib = None
                     index = False
                     if propindexes:
                         index = [ a.name for a in s.axes if a.is_indexed ]
-                    new_store.append(k,data, index=index, data_columns=getattr(s,'data_columns',None))
+                    new_store.append(k, data, index=index, data_columns=getattr(s,'data_columns',None), encoding=s.encoding)
                 else:
-                    new_store.put(k,data)
+                    new_store.put(k, data, encoding=s.encoding)

         return new_store

@@ -1043,13 +1043,13 @@ def convert(self, values, nan_rep, encoding):
             kwargs = dict()
             if self.freq is not None:
-                kwargs['freq'] = self.freq
+                kwargs['freq'] = _ensure_decoded(self.freq)
             if self.tz is not None:
-                kwargs['tz'] = self.tz
+                kwargs['tz'] = _ensure_decoded(self.tz)
             if self.index_name is not None:
-                kwargs['name'] = self.index_name
+                kwargs['name'] = _ensure_decoded(self.index_name)
             try:
-                self.values = Index(_maybe_convert(values, self.kind, encoding), **kwargs)
+                self.values = Index(_maybe_convert(values, self.kind, self.encoding), **kwargs)
             except:

                 # if the output freq is different that what we recorded, then infer it

@@ -1706,7 +1706,7 @@ def get_attrs(self):
         """ retrieve our attributes """
         self.encoding = _ensure_encoding(getattr(self.attrs,'encoding',None))
         for n in self.attributes:
-            setattr(self,n,getattr(self.attrs, n, None))
+            setattr(self,n,_ensure_decoded(getattr(self.attrs, n, None)))

     def write(self, obj, **kwargs):
         self.set_attrs()

@@ -1847,7 +1847,7 @@ def read_multi_index(self, key):

     def read_index_node(self, node):
         data = node[:]
-        kind = node._v_attrs.kind
+        kind = _ensure_decoded(node._v_attrs.kind)
         name = None

         if 'name' in node._v_attrs:

@@ -1858,13 +1858,13 @@ def read_index_node(self, node):
         factory = self._get_index_factory(index_class)

         kwargs = {}
-        if 'freq' in node._v_attrs:
+        if u'freq' in node._v_attrs:
             kwargs['freq'] = node._v_attrs['freq']

-        if 'tz' in node._v_attrs:
+        if u'tz' in node._v_attrs:
             kwargs['tz'] = node._v_attrs['tz']

-        if kind in ('date', 'datetime'):
+        if kind in (u'date', u'datetime'):
             index = factory(_unconvert_index(data, kind, encoding=self.encoding), dtype=object,
                             **kwargs)
         else:

@@ -2077,7 +2077,7 @@ def write(self, obj, **kwargs):
         self.attrs.default_kind = obj.default_kind
         self.write_index('items', obj.items)

-        for name, sdf in obj.iteritems():
+        for name, sdf in obj.iterkv():
             key = 'sparse_frame_%s' % name
             if key not in self.group._v_children:
                 node = self._handle.createGroup(self.group, key)

@@ -3358,7 +3358,8 @@ def _get_converter(kind, encoding):
         raise ValueError('invalid kind %s' % kind)

 def _need_convert(kind):
-    if kind in ('datetime', 'datetime64', 'string'):
+    kind = _ensure_decoded(kind)
+    if kind in (u'datetime', u'datetime64', u'string'):
         return True
     return False

@@ -3464,7 +3465,7 @@ def __init__(self, field, op=None, value=None, queryables=None, i=None, encoding

         # we have valid conditions
         if self.op in ['>', '>=', '<', '<=']:

-            if hasattr(self.value, '__iter__') and len(self.value) > 1:
+            if hasattr(self.value, '__iter__') and len(self.value) > 1 and not isinstance(self.value,basestring):
                 raise ValueError("an inequality condition cannot have multiple values [%s]" % str(self))

         if not is_list_like(self.value):

@@ -3559,7 +3560,7 @@ def stringify(value):
                 value = value.encode(self.encoding)
             return value

-        kind = self.kind
+        kind = _ensure_decoded(self.kind)
         if kind == u'datetime64' or kind == u'datetime' :
             v = lib.Timestamp(v)
             if v.tz is not None:

@@ -3576,7 +3577,7 @@ def stringify(value):
             return TermValue(v,v,kind)
         elif kind == u'bool':
             if isinstance(v, basestring):
-                v = not stringify(v).strip().lower() in [u'false', u'f', u'no', u'n', u'none', u'0', u'[]', u'{}', u'']
+                v = not v.strip().lower() in [u'false', u'f', u'no', u'n', u'none', u'0', u'[]', u'{}', u'']
             else:
                 v = bool(v)
             return TermValue(v,v,kind)
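The hunks above follow one pattern: decode once, then compare against unicode literals so that py2 and py3 take the same branch. In isolation::

    raw = b'datetime64'                     # as read back from a py2-written HDF5 file
    kind = raw.decode('UTF-8') if isinstance(raw, bytes) else raw
    assert kind in (u'datetime', u'datetime64')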
diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py
index edaf905ce7b75..8b3d4a475d952 100644
--- a/pandas/io/tests/test_pytables.py
+++ b/pandas/io/tests/test_pytables.py
@@ -17,6 +17,7 @@
 from pandas.tests.test_series import assert_series_equal
 from pandas.tests.test_frame import assert_frame_equal
 from pandas import concat, Timestamp
+from pandas.util import py3compat
 from numpy.testing.decorators import slow

@@ -1276,8 +1277,14 @@ def test_unimplemented_dtypes_table_columns(self):

         with ensure_clean(self.path) as store:

+            l = [('date', datetime.date(2001, 1, 2))]
+
+            # py3 ok for unicode
+            if not py3compat.PY3:
+                l.append(('unicode', u'\u03c3'))
+
             ### currently not supported dtypes ####
-            for n, f in [('unicode', u'\u03c3'), ('date', datetime.date(2001, 1, 2))]:
+            for n, f in l:
                 df = tm.makeDataFrame()
                 df[n] = f
                 self.assertRaises(

@@ -2602,24 +2609,25 @@ def do_copy(f = None, new_f = None, keys = None, propindexes = True, **kwargs):

                 # check indicies & nrows
                 for k in tstore.keys():
-                    if tstore.is_table(k):
+                    if tstore.get_storer(k).is_table:
                         new_t = tstore.get_storer(k)
                         orig_t = store.get_storer(k)

                         self.assert_(orig_t.nrows == new_t.nrows)
-                        for a in orig_t.axes:
-                            if a.is_indexed:
-                                self.assert_(new_t[a.name].is_indexed == True)
-        except (Exception), detail:
-            pass
+                        # check propindexes
+                        if propindexes:
+                            for a in orig_t.axes:
+                                if a.is_indexed:
+                                    self.assert_(new_t[a.name].is_indexed == True)
+
         finally:
             safe_close(store)
             safe_close(tstore)
             safe_remove(new_f)

         do_copy()
-        do_copy(keys = ['df'])
+        do_copy(keys = ['/a','/b','/df1_mixed'])
         do_copy(propindexes = False)

         # new table
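The corrected do_copy checks amount to this usage sketch (file names and frame contents are illustrative, not part of the patch)::

    import pandas as pd

    store = pd.HDFStore('orig.h5', mode='w')
    store.append('df', pd.DataFrame({'A': range(5)}))
    tstore = store.copy('copy.h5', propindexes=True)

    for k in tstore.keys():
        new_t = tstore.get_storer(k)
        if new_t.is_table:                  # storer-level check, per the fix above
            orig_t = store.get_storer(k)
            assert orig_t.nrows == new_t.nrows
            for a in orig_t.axes:
                if a.is_indexed:            # indexes propagate when propindexes=True
                    assert new_t[a.name].is_indexed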
From 2f7f9bd162b018e56871e9f17308c82894c8e5ff Mon Sep 17 00:00:00 2001
From: jreback
Date: Wed, 5 Jun 2013 14:09:07 -0400
Subject: [PATCH 6/7] DOC: release notes update

---
 RELEASE.rst           | 27 +++------------------------
 pandas/io/pytables.py | 14 +++++++++-----
 2 files changed, 12 insertions(+), 29 deletions(-)

diff --git a/RELEASE.rst b/RELEASE.rst
index c05bb526ab715..3a347246be8dd 100644
--- a/RELEASE.rst
+++ b/RELEASE.rst
@@ -63,14 +63,7 @@ pandas 0.11.1
     to append an index with a different name than the existing
   - support datelike columns with a timezone as data_columns (GH2852_)
   - table writing performance improvements.
-<<<<<<< HEAD
-<<<<<<< HEAD
-=======
-  - support python3 (via ``PyTables 3.0.0``)
->>>>>>> 116ab91... DOC: docstring/release notes updates for py3k
-=======
-  - support py3 (via ``PyTables 3.0.0``)
->>>>>>> ab16d43... ENH: partial py3k support
+  - support python3 (via ``PyTables 3.0.0``) (GH3750_)
   - Add modulo operator to Series, DataFrame
   - Add ``date`` method to DatetimeIndex
   - Simplified the API and added a describe method to Categorical

@@ -87,29 +80,14 @@ pandas 0.11.1

 **API Changes**

-<<<<<<< HEAD
-<<<<<<< HEAD
-  - When removing an object from a ``HDFStore``, ``remove(key)`` raises
-    ``KeyError`` if the key is not a valid store object.
-  - In an ``HDFStore``, raise a ``TypeError`` on passing ``where`` or ``columns``
-    to select with a Storer; these are invalid parameters at this time
-=======
-=======
->>>>>>> ab16d43... ENH: partial py3k support
   - ``HDFStore``

     - When removing an object, ``remove(key)`` raises
      ``KeyError`` if the key is not a valid store object.
    - raise a ``TypeError`` on passing ``where`` or ``columns``
      to select with a Storer; these are invalid parameters at this time
-<<<<<<< HEAD
-    - can now specify an ``encoding`` option to ``append/put``
-      to enable alternate encodings
->>>>>>> 116ab91... DOC: docstring/release notes updates for py3k
-=======
-    - can now specify an ``encoding`` option to ``append`` and ``select``
-      to enable alternate encodings
->>>>>>> ab16d43... ENH: partial py3k support
+    - can now specify an ``encoding`` option to ``append`` and ``select``
+      to enable alternate encodings (GH3750_)
   - The repr() for (Multi)Index now obeys display.max_seq_items
     rather then numpy threshold print options. (GH3426_, GH3466_)
   - Added mangle_dupe_cols option to read_table/csv, allowing users

@@ -315,6 +293,7 @@ pandas 0.11.1
 .. _GH3740: https://github.com/pydata/pandas/issues/3740
 .. _GH3748: https://github.com/pydata/pandas/issues/3748
 .. _GH3741: https://github.com/pydata/pandas/issues/3741
+.. _GH3750: https://github.com/pydata/pandas/issues/3750

 pandas 0.11.0
 =============

diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index 86edb7a43ec47..e772d95d04ee7 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -218,7 +218,7 @@ class HDFStore(object):
     complevel : int, 1-9, default 0
         If a complib is specified compression will be applied
         where possible
-    complib : {'zliu', 'bzip2', 'lzo', 'blosc', None}, default None
+    complib : {'zlib', 'bzip2', 'lzo', 'blosc', None}, default None
         If complevel is > 0 apply compression to objects written
         in the store wherever possible
     fletcher32 : bool, default False

@@ -711,7 +711,8 @@ def create_table_index(self, key, **kwargs):
     def groups(self):
         """ return a list of all the top-level nodes (that are not themselves a pandas storage object) """
         _tables()
-        return [ g for g in self._handle.walkNodes() if getattr(g._v_attrs,'pandas_type',None) or getattr(g,'table',None) or (isinstance(g,_table_mod.table.Table) and g._v_name != u'table') ]
+        return [ g for g in self._handle.walkNodes() if getattr(g._v_attrs,'pandas_type',None) or getattr(
+            g,'table',None) or (isinstance(g,_table_mod.table.Table) and g._v_name != u'table') ]

     def get_node(self, key):
         """ return the node with the key or None if it does not exist """

@@ -731,7 +732,8 @@ def get_storer(self, key):
         s.infer_axes()
         return s

-    def copy(self, file, mode = 'w', propindexes = True, keys = None, complib = None, complevel = None, fletcher32 = False, overwrite = True):
+    def copy(self, file, mode = 'w', propindexes = True, keys = None, complib = None, complevel = None,
+             fletcher32 = False, overwrite = True):
         """ copy the existing store to a new file, upgrading in place

             Parameters
             ----------

@@ -845,7 +847,8 @@ def error(t):
         except:
             error('_TABLE_MAP')

-    def _write_to_group(self, key, value, index=True, table=False, append=False, complib=None, encoding=None, **kwargs):
+    def _write_to_group(self, key, value, index=True, table=False, append=False,
+                        complib=None, encoding=None, **kwargs):
         group = self.get_node(key)

         # remove the node if we are not appending

@@ -870,7 +873,8 @@ def _write_to_group(self, key, value, index=True, table=False, append=False, com
             group = self._handle.createGroup(path, p)
             path = new_path

-        s = self._create_storer(group, value, table=table, append=append, encoding=encoding, **kwargs)
+        s = self._create_storer(group, value, table=table, append=append,
+                                encoding=encoding, **kwargs)
         if append:
             # raise if we are trying to append to a non-table,
             # or a table that exists (and we are putting)
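A usage sketch for the corrected ``complib`` values and the ``encoding`` option from the API changes above (file name and data are illustrative)::

    import pandas as pd

    store = pd.HDFStore('data.h5', mode='w', complevel=9, complib='zlib')
    df = pd.DataFrame({'A': ['foo', 'bar']})
    store.append('df', df, encoding='latin-1')   # alternate encoding, per GH3750
    store.close()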
From fb25ac1e4573de4ffb61ea4ac6f5ab3ced000837 Mon Sep 17 00:00:00 2001
From: jreback
Date: Wed, 5 Jun 2013 19:21:47 -0400
Subject: [PATCH 7/7] BUG: fix numpy 1.6.1 issues; remove need for condvars
 and use literals in the numexpr expressions

---
 pandas/io/pytables.py | 47 +++++++++++++++++++------------------------
 1 file changed, 21 insertions(+), 26 deletions(-)

diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index e772d95d04ee7..b1b7b80e5fd23 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -3314,8 +3314,8 @@ def _unconvert_index_legacy(data, kind, legacy=False, encoding=None):

 def _convert_string_array(data, encoding, itemsize=None):

     # encode if needed
-    if encoding is not None:
-        f = np.vectorize(lambda x: x.encode(encoding))
+    if encoding is not None and len(data):
+        f = np.vectorize(lambda x: x.encode(encoding), otypes=[np.object])
         data = f(data)

     # create the sized dtype

@@ -3333,7 +3333,7 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None):
     # guard against a None encoding in PY3 (because of a legacy
     # where the passed encoding is actually None)
     encoding = _ensure_encoding(encoding)
-    if encoding is not None:
+    if encoding is not None and len(data):
         f = np.vectorize(lambda x: x.decode(encoding),otypes=[np.object])
         data = f(data)

@@ -3378,7 +3378,6 @@ class Term(object):
     value : a value or list of values (required)
     queryables : a kinds map (dict of column name -> kind), or None if column is non-indexable
     encoding : an encoding that will encode the query terms
-    i : my term id number

     Returns
     -------

     _search = re.compile("^\s*(?P<field>\w+)\s*(?P<op>%s)\s*(?P<value>.+)\s*$" % '|'.join(_ops))
     _max_selectors = 31

-    def __init__(self, field, op=None, value=None, queryables=None, i=None, encoding=None):
+    def __init__(self, field, op=None, value=None, queryables=None, encoding=None):
         self.field = None
         self.op = None
         self.value = None
         self.q = queryables or dict()
         self.filter = None
-
-        if i is None:
-            i = 0
-        self.i = i
         self.condition = None
-        self.condvars = dict()
         self.encoding = encoding

         # unpack lists/tuples in field

@@ -3498,15 +3492,10 @@ def kind(self):
         """ the kind of my field """
         return self.q.get(self.field)

-    def generate(self, v, i=None):
-        """ create and return the op string for this TermValue
-            add the variable to condvars """
-        if i is None:
-            i = 0
-
-        cv = "_%s_%s_%s" % (self.field,self.i,i)
-        self.condvars[cv] = v.converted
-        return "(%s %s %s)" % (self.field, self.op, cv)
+    def generate(self, v):
+        """ create and return the op string for this TermValue """
+        val = v.tostring(self.encoding)
+        return "(%s %s %s)" % (self.field, self.op, val)

     def eval(self):
         """ set the numexpr expression for this term """
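The literal-based generation that replaces condvars can be exercised in isolation (this mirrors the ``tostring`` method added below; names and values are illustrative)::

    class TermValue(object):
        def __init__(self, value, converted, kind):
            self.value, self.converted, self.kind = value, converted, kind

        def tostring(self, encoding):
            if self.kind == u'string':
                if encoding is not None:
                    return self.converted        # already encoded; inline as-is
                return '"%s"' % self.converted   # quote raw strings for numexpr
            return self.converted

    tv = TermValue('bar', 'bar', u'string')
    print("(column == %s)" % tv.tostring(None))  # -> (column == "bar")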
@@ -3534,7 +3523,7 @@ def eval(self):

                 # too many values to create the expression?
                 if len(values) <= self._max_selectors:
-                    vs = [ self.generate(v, i) for i, v in enumerate(values) ]
+                    vs = [ self.generate(v) for v in values ]
                     self.condition = "(%s)" % ' | '.join(vs)

                 # use a filter after reading

@@ -3600,6 +3589,15 @@ def __init__(self, value, converted, kind):
         self.converted = converted
         self.kind = kind

+    def tostring(self, encoding):
+        """ quote the string if not encoded
+            else encode and return """
+        if self.kind == u'string':
+            if encoding is not None:
+                return self.converted
+            return '"%s"' % self.converted
+        return self.converted
+
 class Coordinates(object):
     """ holds a returned coordinates list, useful to select the same rows from different tables

@@ -3637,7 +3635,6 @@ def __init__(self, table, where=None, start=None, stop=None, **kwargs):
         self.start = start
         self.stop = stop
         self.condition = None
-        self.condvars = dict()
         self.filter = None
         self.terms = None
         self.coordinates = None

@@ -3652,8 +3649,6 @@ def __init__(self, table, where=None, start=None, stop=None, **kwargs):
             terms = [ t for t in self.terms if t.condition is not None ]
             if len(terms):
                 self.condition = "(%s)" % ' & '.join([ t.condition for t in terms ])
-                for t in terms:
-                    self.condvars.update(t.condvars)
             self.filter = []
             for t in self.terms:
                 if t.filter is not None:

@@ -3676,14 +3671,14 @@ def generate(self, where):
             where = [where]

         queryables = self.table.queryables()
-        return [Term(c, queryables=queryables, i=i, encoding=self.table.encoding) for i, c in enumerate(where)]
+        return [Term(c, queryables=queryables, encoding=self.table.encoding) for c in where]

     def select(self):
         """ generate the selection """
         if self.condition is not None:
-            return self.table.table.readWhere(self.condition, condvars=self.condvars, start=self.start, stop=self.stop)
+            return self.table.table.readWhere(self.condition, start=self.start, stop=self.stop)
         elif self.coordinates is not None:
             return self.table.table.readCoordinates(self.coordinates)
         return self.table.table.read(start=self.start, stop=self.stop)

@@ -3695,7 +3690,7 @@ def select_coords(self):
         if self.condition is None:
             return np.arange(self.table.nrows)

-        return self.table.table.getWhereList(self.condition, condvars=self.condvars, start=self.start, stop=self.stop, sort=True)
+        return self.table.table.getWhereList(self.condition, start=self.start, stop=self.stop, sort=True)

 ### utilities ###
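Taken together, the series makes this py3 round trip work end to end (usage sketch; file name and data are illustrative)::

    import pandas as pd
    from pandas.io.pytables import Term

    df = pd.DataFrame({'A': range(20)},
                      index=pd.date_range('1/1/2013', periods=20))
    store = pd.HDFStore('py3.h5', mode='w')
    store.append('df', df, data_columns=['A'])

    # the Selection machinery now inlines literals into the numexpr condition
    result = store.select('df', [Term('A', '>', 10)])
    store.close()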