diff --git a/README.rst b/README.rst index daea702476ebc..85868176722bd 100644 --- a/README.rst +++ b/README.rst @@ -85,7 +85,6 @@ Optional dependencies - `Cython `__: Only necessary to build development version. Version 0.17.1 or higher. - `SciPy `__: miscellaneous statistical functions - `PyTables `__: necessary for HDF5-based storage - - Not yet supported on python >= 3 - `matplotlib `__: for plotting - `statsmodels `__ - Needed for parts of :mod:`pandas.stats` diff --git a/RELEASE.rst b/RELEASE.rst index 12d2389a8a59b..3a347246be8dd 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -63,6 +63,7 @@ pandas 0.11.1 to append an index with a different name than the existing - support datelike columns with a timezone as data_columns (GH2852_) - table writing performance improvements. + - support python3 (via ``PyTables 3.0.0``) (GH3750_) - Add modulo operator to Series, DataFrame - Add ``date`` method to DatetimeIndex - Simplified the API and added a describe method to Categorical @@ -79,10 +80,14 @@ pandas 0.11.1 **API Changes** - - When removing an object from a ``HDFStore``, ``remove(key)`` raises - ``KeyError`` if the key is not a valid store object. - - In an ``HDFStore``, raise a ``TypeError`` on passing ``where`` or ``columns`` - to select with a Storer; these are invalid parameters at this time + - ``HDFStore`` + + - When removing an object, ``remove(key)`` raises + ``KeyError`` if the key is not a valid store object. + - raise a ``TypeError`` on passing ``where`` or ``columns`` + to select with a Storer; these are invalid parameters at this time + - can now specify an ``encoding`` option to ``append/put`` + to enable alternate encodings (GH3750_) - The repr() for (Multi)Index now obeys display.max_seq_items rather then numpy threshold print options. (GH3426_, GH3466_) - Added mangle_dupe_cols option to read_table/csv, allowing users @@ -288,6 +293,7 @@ pandas 0.11.1 .. _GH3740: https://github.com/pydata/pandas/issues/3740 .. _GH3748: https://github.com/pydata/pandas/issues/3748 .. _GH3741: https://github.com/pydata/pandas/issues/3741 +.. _GH3750: https://github.com/pydata/pandas/issues/3750 pandas 0.11.0 ============= diff --git a/ci/install.sh b/ci/install.sh index a091834a9570f..b748070db85aa 100755 --- a/ci/install.sh +++ b/ci/install.sh @@ -69,13 +69,11 @@ if ( ! $VENV_FILE_AVAILABLE ); then pip install $PIP_ARGS cython if [ ${TRAVIS_PYTHON_VERSION:0:1} == "2" ]; then - # installed explicitly above, to get the library as well - # sudo apt-get $APT_ARGS install libhdf5-serial-dev; - pip install numexpr - pip install tables pip install $PIP_ARGS xlwt fi + pip install numexpr + pip install tables pip install $PIP_ARGS matplotlib pip install $PIP_ARGS openpyxl pip install $PIP_ARGS xlrd>=0.9.0 diff --git a/doc/source/install.rst b/doc/source/install.rst index 6868969c1b968..9dc8064da45e3 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -95,7 +95,6 @@ Optional Dependencies version. Version 0.17.1 or higher. * `SciPy `__: miscellaneous statistical functions * `PyTables `__: necessary for HDF5-based storage - * Not yet supported on python >= 3 * `matplotlib `__: for plotting * `statsmodels `__ * Needed for parts of :mod:`pandas.stats` diff --git a/doc/source/io.rst b/doc/source/io.rst index 802ab08e85932..1c615ca278668 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1300,12 +1300,11 @@ the high performance HDF5 format using the excellent `PyTables `__ library. See the :ref:`cookbook` for some advanced strategies -.. 
warning:: +.. note:: - ``PyTables`` 3.0.0 was recently released. This enables support for Python 3, - however, it has not been integrated into pandas as of yet. (Under Python 2, - ``PyTables`` version >= 2.3 is supported). - + ``PyTables`` 3.0.0 was recently released, enabling support for Python 3. + Pandas should be fully compatible (and previously written stores should be + backwards compatible) with all ``PyTables`` >= 2.3. .. ipython:: python :suppress: diff --git a/doc/source/v0.11.1.txt b/doc/source/v0.11.1.txt index b2fee1acbc4d6..badb364d214d1 100644 --- a/doc/source/v0.11.1.txt +++ b/doc/source/v0.11.1.txt @@ -237,6 +237,9 @@ Enhancements pd.get_option('a.b') pd.get_option('b.c') + - Support for ``HDFStore`` (via ``PyTables 3.0.0``) on Python3 + + Bug Fixes ~~~~~~~~~ diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 5a480e08effba..b1b7b80e5fd23 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -13,12 +13,13 @@ import numpy as np from pandas import ( - Series, TimeSeries, DataFrame, Panel, Panel4D, Index, MultiIndex, Int64Index + Series, TimeSeries, DataFrame, Panel, Panel4D, Index, + MultiIndex, Int64Index, Timestamp ) from pandas.sparse.api import SparseSeries, SparseDataFrame, SparsePanel from pandas.sparse.array import BlockIndex, IntIndex from pandas.tseries.api import PeriodIndex, DatetimeIndex -from pandas.core.common import adjoin, isnull +from pandas.core.common import adjoin, isnull, is_list_like from pandas.core.algorithms import match, unique, factorize from pandas.core.categorical import Categorical from pandas.core.common import _asarray_tuplesafe, _try_sort @@ -27,6 +28,7 @@ from pandas.core.index import Int64Index, _ensure_index import pandas.core.common as com from pandas.tools.merge import concat +from pandas.util import py3compat import pandas.lib as lib import pandas.algos as algos @@ -37,6 +39,21 @@ # versioning attribute _version = '0.10.1' +# PY3 encoding if we don't specify +_default_encoding = 'UTF-8' + +def _ensure_decoded(s): + """ if we have bytes, decode them to unicode """ + if isinstance(s, np.bytes_): + s = s.decode('UTF-8') + return s +def _ensure_encoding(encoding): + # set the encoding if we need + if encoding is None: + if py3compat.PY3: + encoding = _default_encoding + return encoding + class IncompatibilityWarning(Warning): pass incompatibility_doc = """ where criteria is being ignored as this version [%s] is too old (or not-defined), @@ -56,40 +73,40 @@ class PerformanceWarning(Warning): pass # map object types _TYPE_MAP = { - Series : 'series', - SparseSeries : 'sparse_series', - TimeSeries : 'series', - DataFrame : 'frame', - SparseDataFrame : 'sparse_frame', - Panel : 'wide', - Panel4D : 'ndim', - SparsePanel : 'sparse_panel' + Series : u'series', + SparseSeries : u'sparse_series', + TimeSeries : u'series', + DataFrame : u'frame', + SparseDataFrame : u'sparse_frame', + Panel : u'wide', + Panel4D : u'ndim', + SparsePanel : u'sparse_panel' } # storer class map _STORER_MAP = { - 'TimeSeries' : 'LegacySeriesStorer', - 'Series' : 'LegacySeriesStorer', - 'DataFrame' : 'LegacyFrameStorer', - 'DataMatrix' : 'LegacyFrameStorer', - 'series' : 'SeriesStorer', - 'sparse_series' : 'SparseSeriesStorer', - 'frame' : 'FrameStorer', - 'sparse_frame' : 'SparseFrameStorer', - 'wide' : 'PanelStorer', - 'sparse_panel' : 'SparsePanelStorer', + u'TimeSeries' : 'LegacySeriesStorer', + u'Series' : 'LegacySeriesStorer', + u'DataFrame' : 'LegacyFrameStorer', + u'DataMatrix' : 'LegacyFrameStorer', + u'series' : 'SeriesStorer', + 
u'sparse_series' : 'SparseSeriesStorer', + u'frame' : 'FrameStorer', + u'sparse_frame' : 'SparseFrameStorer', + u'wide' : 'PanelStorer', + u'sparse_panel' : 'SparsePanelStorer', } # table class map _TABLE_MAP = { - 'generic_table' : 'GenericTable', - 'appendable_frame' : 'AppendableFrameTable', - 'appendable_multiframe' : 'AppendableMultiFrameTable', - 'appendable_panel' : 'AppendablePanelTable', - 'appendable_ndim' : 'AppendableNDimTable', - 'worm' : 'WORMTable', - 'legacy_frame' : 'LegacyFrameTable', - 'legacy_panel' : 'LegacyPanelTable', + u'generic_table' : 'GenericTable', + u'appendable_frame' : 'AppendableFrameTable', + u'appendable_multiframe' : 'AppendableMultiFrameTable', + u'appendable_panel' : 'AppendablePanelTable', + u'appendable_ndim' : 'AppendableNDimTable', + u'worm' : 'WORMTable', + u'legacy_frame' : 'LegacyFrameTable', + u'legacy_panel' : 'LegacyPanelTable', } # axes map @@ -522,15 +539,16 @@ def put(self, key, value, table=None, append=False, **kwargs): Parameters ---------- - key : object - value : {Series, DataFrame, Panel} - table : boolean, default False + key : object + value : {Series, DataFrame, Panel} + table : boolean, default False Write as a PyTables Table structure which may perform worse but allow more flexible operations like searching / selecting subsets of the data - append : boolean, default False + append : boolean, default False For table data structures, append the input data to the existing table + encoding : default None, provide an encoding for strings """ self._write_to_group(key, value, table=table, append=append, **kwargs) @@ -595,6 +613,7 @@ def append(self, key, value, columns=None, **kwargs): nan_rep : string to use as string nan represenation chunksize : size to chunk the writing expectedrows : expected TOTAL row size of this table + encoding : default None, provide an encoding for strings Notes ----- @@ -692,7 +711,8 @@ def create_table_index(self, key, **kwargs): def groups(self): """ return a list of all the top-level nodes (that are not themselves a pandas storage object) """ _tables() - return [ g for g in self._handle.walkNodes() if getattr(g._v_attrs,'pandas_type',None) or getattr(g,'table',None) or (isinstance(g,_table_mod.table.Table) and g._v_name != 'table') ] + return [ g for g in self._handle.walkNodes() if getattr(g._v_attrs,'pandas_type',None) or getattr( + g,'table',None) or (isinstance(g,_table_mod.table.Table) and g._v_name != u'table') ] def get_node(self, key): """ return the node with the key or None if it does not exist """ @@ -712,7 +732,8 @@ def get_storer(self, key): s.infer_axes() return s - def copy(self, file, mode = 'w', propindexes = True, keys = None, complib = None, complevel = None, fletcher32 = False, overwrite = True): + def copy(self, file, mode = 'w', propindexes = True, keys = None, complib = None, complevel = None, + fletcher32 = False, overwrite = True): """ copy the existing store to a new file, upgrading in place Parameters @@ -746,9 +767,9 @@ def copy(self, file, mode = 'w', propindexes = True, keys = None, complib = None index = False if propindexes: index = [ a.name for a in s.axes if a.is_indexed ] - new_store.append(k,data, index=index, data_columns=getattr(s,'data_columns',None)) + new_store.append(k, data, index=index, data_columns=getattr(s,'data_columns',None), encoding=s.encoding) else: - new_store.put(k,data) + new_store.put(k, data, encoding=s.encoding) return new_store @@ -761,8 +782,8 @@ def error(t): raise TypeError("cannot properly create the storer for: [%s] 
[group->%s,value->%s,table->%s,append->%s,kwargs->%s]" % (t,group,type(value),table,append,kwargs)) - pt = getattr(group._v_attrs,'pandas_type',None) - tt = getattr(group._v_attrs,'table_type',None) + pt = _ensure_decoded(getattr(group._v_attrs,'pandas_type',None)) + tt = _ensure_decoded(getattr(group._v_attrs,'table_type',None)) # infer the pt from the passed value if pt is None: @@ -770,8 +791,8 @@ def error(t): _tables() if getattr(group,'table',None) or isinstance(group,_table_mod.table.Table): - pt = 'frame_table' - tt = 'generic_table' + pt = u'frame_table' + tt = u'generic_table' else: raise TypeError("cannot create a storer if the object is not existing nor a value are passed") else: @@ -783,10 +804,10 @@ def error(t): # we are actually a table if table or append: - pt += '_table' + pt += u'_table' # a storer node - if 'table' not in pt: + if u'table' not in pt: try: return globals()[_STORER_MAP[pt]](self, group, **kwargs) except: @@ -798,26 +819,26 @@ def error(t): # if we are a writer, determin the tt if value is not None: - if pt == 'frame_table': + if pt == u'frame_table': index = getattr(value,'index',None) if index is not None: if index.nlevels == 1: - tt = 'appendable_frame' + tt = u'appendable_frame' elif index.nlevels > 1: - tt = 'appendable_multiframe' - elif pt == 'wide_table': - tt = 'appendable_panel' - elif pt == 'ndim_table': - tt = 'appendable_ndim' + tt = u'appendable_multiframe' + elif pt == u'wide_table': + tt = u'appendable_panel' + elif pt == u'ndim_table': + tt = u'appendable_ndim' else: # distiguish between a frame/table - tt = 'legacy_panel' + tt = u'legacy_panel' try: fields = group.table._v_attrs.fields - if len(fields) == 1 and fields[0] == 'value': - tt = 'legacy_frame' + if len(fields) == 1 and fields[0] == u'value': + tt = u'legacy_frame' except: pass @@ -826,7 +847,8 @@ def error(t): except: error('_TABLE_MAP') - def _write_to_group(self, key, value, index=True, table=False, append=False, complib=None, **kwargs): + def _write_to_group(self, key, value, index=True, table=False, append=False, + complib=None, encoding=None, **kwargs): group = self.get_node(key) # remove the node if we are not appending @@ -851,7 +873,8 @@ def _write_to_group(self, key, value, index=True, table=False, append=False, com group = self._handle.createGroup(path, p) path = new_path - s = self._create_storer(group, value, table=table, append=append, **kwargs) + s = self._create_storer(group, value, table=table, append=append, + encoding=encoding, **kwargs) if append: # raise if we are trying to append to a non-table, # or a table that exists (and we are putting) @@ -890,7 +913,7 @@ class TableIterator(object): def __init__(self, func, nrows, start=None, stop=None, chunksize=None): self.func = func - self.nrows = nrows + self.nrows = nrows or 0 self.start = start or 0 if stop is None: @@ -1015,7 +1038,7 @@ def infer(self, table): new_self.get_attr() return new_self - def convert(self, values, nan_rep): + def convert(self, values, nan_rep, encoding): """ set the values from this selection: take = take ownership """ try: values = values[self.cname] @@ -1024,19 +1047,19 @@ def convert(self, values, nan_rep): kwargs = dict() if self.freq is not None: - kwargs['freq'] = self.freq + kwargs['freq'] = _ensure_decoded(self.freq) if self.tz is not None: - kwargs['tz'] = self.tz + kwargs['tz'] = _ensure_decoded(self.tz) if self.index_name is not None: - kwargs['name'] = self.index_name + kwargs['name'] = _ensure_decoded(self.index_name) try: - self.values = Index(_maybe_convert(values, 
self.kind), **kwargs) + self.values = Index(_maybe_convert(values, self.kind, self.encoding), **kwargs) except: # if the output freq is different that what we recorded, then infer it if 'freq' in kwargs: kwargs['freq'] = 'infer' - self.values = Index(_maybe_convert(values, self.kind), **kwargs) + self.values = Index(_maybe_convert(values, self.kind, encoding), **kwargs) return self def take_data(self): @@ -1068,7 +1091,7 @@ def __iter__(self): def maybe_set_size(self, min_itemsize=None, **kwargs): """ maybe set a string col itemsize: min_itemsize can be an interger or a dict with this columns name with an integer size """ - if self.kind == 'string': + if _ensure_decoded(self.kind) == u'string': if isinstance(min_itemsize, dict): min_itemsize = min_itemsize.get(self.name) @@ -1088,7 +1111,7 @@ def validate_col(self, itemsize=None): # validate this column for string truncation (or reset to the max size) dtype = getattr(self, 'dtype', None) - if self.kind == 'string': + if _ensure_decoded(self.kind) == u'string': c = self.col if c is not None: @@ -1167,7 +1190,7 @@ class GenericIndexCol(IndexCol): def is_indexed(self): return False - def convert(self, values, nan_rep): + def convert(self, values, nan_rep, encoding): """ set the values from this selection: take = take ownership """ self.values = Int64Index(np.arange(self.table.nrows)) @@ -1218,7 +1241,7 @@ def __init__(self, values=None, kind=None, typ=None, cname=None, data=None, bloc super(DataCol, self).__init__( values=values, kind=kind, typ=typ, cname=cname, **kwargs) self.dtype = None - self.dtype_attr = "%s_dtype" % self.name + self.dtype_attr = u"%s_dtype" % self.name self.set_data(data) def __repr__(self): @@ -1246,22 +1269,25 @@ def take_data(self): def set_kind(self): # set my kind if we can if self.dtype is not None: - if self.dtype.startswith('string'): + dtype = _ensure_decoded(self.dtype) + if dtype.startswith(u'string') or dtype.startswith(u'bytes'): self.kind = 'string' - elif self.dtype.startswith('float'): + elif dtype.startswith(u'float'): self.kind = 'float' - elif self.dtype.startswith('int'): + elif dtype.startswith(u'int') or dtype.startswith(u'uint'): self.kind = 'integer' - elif self.dtype.startswith('date'): + elif dtype.startswith(u'date'): self.kind = 'datetime' - elif self.dtype.startswith('bool'): + elif dtype.startswith(u'bool'): self.kind = 'bool' + else: + raise AssertionError("cannot interpret dtype of [%s] in [%s]" % (dtype,self)) # set my typ if we need if self.typ is None: self.typ = getattr(self.description,self.cname,None) - def set_atom(self, block, existing_col, min_itemsize, nan_rep, info, **kwargs): + def set_atom(self, block, existing_col, min_itemsize, nan_rep, info, encoding=None, **kwargs): """ create and setup my atom from the block b """ self.values = list(block.items) @@ -1304,7 +1330,7 @@ def set_atom(self, block, existing_col, min_itemsize, nan_rep, info, **kwargs): # this is basically a catchall; if say a datetime64 has nans then will # end up here ### elif inferred_type == 'string' or dtype == 'object': - self.set_atom_string(block, existing_col, min_itemsize, nan_rep) + self.set_atom_string(block, existing_col, min_itemsize, nan_rep, encoding) else: self.set_atom_data(block) @@ -1313,7 +1339,7 @@ def set_atom(self, block, existing_col, min_itemsize, nan_rep, info, **kwargs): def get_atom_string(self, block, itemsize): return _tables().StringCol(itemsize=itemsize, shape=block.shape[0]) - def set_atom_string(self, block, existing_col, min_itemsize, nan_rep): + def set_atom_string(self, 
block, existing_col, min_itemsize, nan_rep, encoding): # fill nan items with myself block = block.fillna(nan_rep) data = block.values @@ -1334,7 +1360,7 @@ def set_atom_string(self, block, existing_col, min_itemsize, nan_rep): # itemsize is the maximum length of a string (along any dimension) - itemsize = lib.max_len_string_array(data.ravel()) + itemsize = lib.max_len_string_array(com._ensure_object(data.ravel())) # specified min_itemsize? if isinstance(min_itemsize, dict): @@ -1351,10 +1377,10 @@ def set_atom_string(self, block, existing_col, min_itemsize, nan_rep): self.itemsize = itemsize self.kind = 'string' self.typ = self.get_atom_string(block, itemsize) - self.set_data(self.convert_string_data(data, itemsize)) + self.set_data(self.convert_string_data(data, itemsize, encoding)) - def convert_string_data(self, data, itemsize): - return data.astype('S%s' % itemsize) + def convert_string_data(self, data, itemsize, encoding): + return _convert_string_array(data, encoding, itemsize) def get_atom_coltype(self): """ return the PyTables column class for this column """ @@ -1407,7 +1433,7 @@ def validate_attr(self, append): raise ValueError("appended items dtype do not match existing items dtype" " in table!") - def convert(self, values, nan_rep): + def convert(self, values, nan_rep, encoding): """ set the data from this selection (and convert to the correct dtype if we can) """ try: values = values[self.cname] @@ -1417,9 +1443,10 @@ def convert(self, values, nan_rep): # convert to the correct dtype if self.dtype is not None: + dtype = _ensure_decoded(self.dtype) # reverse converts - if self.dtype == 'datetime64': + if dtype == u'datetime64': # recreate the timezone if self.tz is not None: @@ -1432,30 +1459,30 @@ def convert(self, values, nan_rep): else: self.data = np.asarray(self.data, dtype='M8[ns]') - elif self.dtype == 'date': + elif dtype == u'date': self.data = np.array( [date.fromtimestamp(v) for v in self.data], dtype=object) - elif self.dtype == 'datetime': + elif dtype == u'datetime': self.data = np.array( [datetime.fromtimestamp(v) for v in self.data], dtype=object) else: try: - self.data = self.data.astype(self.dtype) + self.data = self.data.astype(dtype) except: self.data = self.data.astype('O') - # convert nans - if self.kind == 'string': - self.data = lib.array_replace_from_nan_rep( - self.data.ravel(), nan_rep).reshape(self.data.shape) + # convert nans / decode + if _ensure_decoded(self.kind) == u'string': + self.data = _unconvert_string_array(self.data, nan_rep=nan_rep, encoding=encoding) + return self def get_attr(self): """ get the data for this colummn """ self.values = getattr(self.attrs, self.kind_attr, None) - self.dtype = getattr(self.attrs, self.dtype_attr, None) + self.dtype = getattr(self.attrs, self.dtype_attr, None) self.set_kind() def set_attr(self): @@ -1471,7 +1498,7 @@ class DataIndexableCol(DataCol): @property def is_searchable(self): - return self.kind == 'string' + return _ensure_decoded(self.kind) == u'string' def get_atom_string(self, block, itemsize): return _tables().StringCol(itemsize=itemsize) @@ -1504,9 +1531,10 @@ class Storer(object): ndim = None is_table = False - def __init__(self, parent, group, **kwargs): + def __init__(self, parent, group, encoding=None, **kwargs): self.parent = parent self.group = group + self.encoding = _ensure_encoding(encoding) self.set_version() @property @@ -1515,7 +1543,7 @@ def is_old_version(self): def set_version(self): """ compute and set our version """ - version = 
getattr(self.group._v_attrs,'pandas_version',None) + version = _ensure_decoded(getattr(self.group._v_attrs,'pandas_version',None)) try: self.version = tuple([int(x) for x in version.split('.')]) if len(self.version) == 2: @@ -1525,7 +1553,7 @@ def set_version(self): @property def pandas_type(self): - return getattr(self.group._v_attrs, 'pandas_type', None) + return _ensure_decoded(getattr(self.group._v_attrs, 'pandas_type', None)) def __repr__(self): """ return a pretty representatgion of myself """ @@ -1674,10 +1702,18 @@ def validate_read(self, kwargs): def is_exists(self): return True + def set_attrs(self): + """ set our object attributes """ + self.attrs.encoding = self.encoding + def get_attrs(self): """ retrieve our attributes """ + self.encoding = _ensure_encoding(getattr(self.attrs,'encoding',None)) for n in self.attributes: - setattr(self,n,getattr(self.attrs, n, None)) + setattr(self,n,_ensure_decoded(getattr(self.attrs, n, None))) + + def write(self, obj, **kwargs): + self.set_attrs() def read_array(self, key): """ read an array for the specified node (off of group """ @@ -1700,7 +1736,7 @@ def read_array(self, key): else: ret = data - if dtype == 'datetime64': + if dtype == u'datetime64': ret = np.array(ret, dtype='M8[ns]') if transposed: @@ -1709,15 +1745,15 @@ def read_array(self, key): return ret def read_index(self, key): - variety = getattr(self.attrs, '%s_variety' % key) + variety = _ensure_decoded(getattr(self.attrs, '%s_variety' % key)) - if variety == 'multi': + if variety == u'multi': return self.read_multi_index(key) - elif variety == 'block': + elif variety == u'block': return self.read_block_index(key) - elif variety == 'sparseint': + elif variety == u'sparseint': return self.read_sparse_intindex(key) - elif variety == 'regular': + elif variety == u'regular': _, index = self.read_index_node(getattr(self.group, key)) return index else: # pragma: no cover @@ -1735,7 +1771,7 @@ def write_index(self, key, index): self.write_sparse_intindex(key, index) else: setattr(self.attrs, '%s_variety' % key, 'regular') - converted = _convert_index(index).set_name('index') + converted = _convert_index(index,self.encoding).set_name('index') self.write_array(key, converted.values) node = getattr(self.group, key) node._v_attrs.kind = converted.kind @@ -1782,7 +1818,7 @@ def write_multi_index(self, key, index): index.names)): # write the level level_key = '%s_level%d' % (key, i) - conv_level = _convert_index(lev).set_name(level_key) + conv_level = _convert_index(lev, self.encoding).set_name(level_key) self.write_array(level_key, conv_level.values) node = getattr(self.group, level_key) node._v_attrs.kind = conv_level.kind @@ -1815,7 +1851,7 @@ def read_multi_index(self, key): def read_index_node(self, node): data = node[:] - kind = node._v_attrs.kind + kind = _ensure_decoded(node._v_attrs.kind) name = None if 'name' in node._v_attrs: @@ -1826,23 +1862,22 @@ def read_index_node(self, node): factory = self._get_index_factory(index_class) kwargs = {} - if 'freq' in node._v_attrs: + if u'freq' in node._v_attrs: kwargs['freq'] = node._v_attrs['freq'] - if 'tz' in node._v_attrs: + if u'tz' in node._v_attrs: kwargs['tz'] = node._v_attrs['tz'] - if kind in ('date', 'datetime'): - index = factory(_unconvert_index(data, kind), dtype=object, + if kind in (u'date', u'datetime'): + index = factory(_unconvert_index(data, kind, encoding=self.encoding), dtype=object, **kwargs) else: - index = factory(_unconvert_index(data, kind), **kwargs) + index = factory(_unconvert_index(data, kind, 
encoding=self.encoding), **kwargs) index.name = name return name, index - def write_array_empty(self, key, value): """ write a 0-len array """ @@ -1922,7 +1957,7 @@ def read_index_legacy(self, key): node = getattr(self.group,key) data = node[:] kind = node._v_attrs.kind - return _unconvert_index_legacy(data, kind) + return _unconvert_index_legacy(data, kind, encoding=self.encoding) class LegacySeriesStorer(LegacyStorer): @@ -1942,7 +1977,7 @@ def read(self, **kwargs): return DataFrame(values, index=index, columns=columns) class SeriesStorer(GenericStorer): - pandas_kind = 'series' + pandas_kind = u'series' attributes = ['name'] @property @@ -1963,12 +1998,13 @@ def read(self, **kwargs): return Series(values, index=index, name=self.name) def write(self, obj, **kwargs): + super(SeriesStorer, self).write(obj, **kwargs) self.write_index('index', obj.index) self.write_array('values', obj.values) self.attrs.name = obj.name class SparseSeriesStorer(GenericStorer): - pandas_kind = 'sparse_series' + pandas_kind = u'sparse_series' attributes = ['name','fill_value','kind'] def read(self, **kwargs): @@ -1977,10 +2013,11 @@ def read(self, **kwargs): sp_values = self.read_array('sp_values') sp_index = self.read_index('sp_index') return SparseSeries(sp_values, index=index, sparse_index=sp_index, - kind=self.kind or 'block', fill_value=self.fill_value, + kind=self.kind or u'block', fill_value=self.fill_value, name=self.name) def write(self, obj, **kwargs): + super(SparseSeriesStorer, self).write(obj, **kwargs) self.write_index('index', obj.index) self.write_index('sp_index', obj.sp_index) self.write_array('sp_values', obj.sp_values) @@ -1989,7 +2026,7 @@ def write(self, obj, **kwargs): self.attrs.kind = obj.kind class SparseFrameStorer(GenericStorer): - pandas_kind = 'sparse_frame' + pandas_kind = u'sparse_frame' attributes = ['default_kind','default_fill_value'] def read(self, **kwargs): @@ -2007,6 +2044,7 @@ def read(self, **kwargs): def write(self, obj, **kwargs): """ write it as a collection of individual sparse series """ + super(SparseFrameStorer, self).write(obj, **kwargs) for name, ss in obj.iteritems(): key = 'sparse_series_%s' % name if key not in self.group._v_children: @@ -2020,7 +2058,7 @@ def write(self, obj, **kwargs): self.write_index('columns', obj.columns) class SparsePanelStorer(GenericStorer): - pandas_kind = 'sparse_panel' + pandas_kind = u'sparse_panel' attributes = ['default_kind','default_fill_value'] def read(self, **kwargs): @@ -2038,11 +2076,12 @@ def read(self, **kwargs): default_fill_value=self.default_fill_value) def write(self, obj, **kwargs): + super(SparsePanelStorer, self).write(obj, **kwargs) self.attrs.default_fill_value = obj.default_fill_value self.attrs.default_kind = obj.default_kind self.write_index('items', obj.items) - for name, sdf in obj.iteritems(): + for name, sdf in obj.iterkv(): key = 'sparse_frame_%s' % name if key not in self.group._v_children: node = self._handle.createGroup(self.group, key) @@ -2105,6 +2144,7 @@ def read(self, **kwargs): return self.obj_type(BlockManager(blocks, axes)) def write(self, obj, **kwargs): + super(BlockManagerStorer, self).write(obj, **kwargs) data = obj._data if not data.is_consolidated(): data = data.consolidate() @@ -2122,11 +2162,11 @@ def write(self, obj, **kwargs): self.write_index('block%d_items' % i, blk.items) class FrameStorer(BlockManagerStorer): - pandas_kind = 'frame' + pandas_kind = u'frame' obj_type = DataFrame class PanelStorer(BlockManagerStorer): - pandas_kind = 'wide' + pandas_kind = u'wide' obj_type = 
Panel is_shape_reversed = True @@ -2151,7 +2191,7 @@ class Table(Storer): levels : the names of levels """ - pandas_kind = 'wide_table' + pandas_kind = u'wide_table' table_type = None levels = 1 is_table = True @@ -2225,7 +2265,7 @@ def nrows_expected(self): @property def is_exists(self): """ has this table been created """ - return 'table' in self.group + return u'table' in self.group @property def storable(self): @@ -2291,6 +2331,7 @@ def set_attrs(self): self.attrs.non_index_axes = self.non_index_axes self.attrs.data_columns = self.data_columns self.attrs.nan_rep = self.nan_rep + self.attrs.encoding = self.encoding self.attrs.levels = self.levels self.set_info() @@ -2300,6 +2341,7 @@ def get_attrs(self): self.data_columns = getattr(self.attrs,'data_columns',None) or [] self.info = getattr(self.attrs,'info',None) or dict() self.nan_rep = getattr(self.attrs,'nan_rep',None) + self.encoding = _ensure_encoding(getattr(self.attrs,'encoding',None)) self.levels = getattr(self.attrs,'levels',None) or [] t = self.table self.index_axes = [ a.infer(t) for a in self.indexables if a.is_an_indexable ] @@ -2430,7 +2472,7 @@ def read_axes(self, where, **kwargs): # convert the data for a in self.axes: a.set_info(self.info) - a.convert(values, nan_rep=self.nan_rep) + a.convert(values, nan_rep=self.nan_rep, encoding=self.encoding) return True @@ -2473,6 +2515,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, validate: validate the obj against an existiing object already written min_itemsize: a dict of the min size for a column in bytes nan_rep : a values to use for string column nan_rep + encoding : the encoding for string values data_columns : a list of columns that we want to create separate to allow indexing (or True will force all colummns) """ @@ -2492,10 +2535,11 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, if self.infer_axes(): existing_table = self.copy() existing_table.infer_axes() - axes = [ a.axis for a in existing_table.index_axes] - data_columns = existing_table.data_columns - nan_rep = existing_table.nan_rep - self.info = copy.copy(existing_table.info) + axes = [ a.axis for a in existing_table.index_axes] + data_columns = existing_table.data_columns + nan_rep = existing_table.nan_rep + self.encoding = existing_table.encoding + self.info = copy.copy(existing_table.info) else: existing_table = None @@ -2510,6 +2554,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, # nan_representation if nan_rep is None: nan_rep = 'nan' + self.nan_rep = nan_rep # create axes to index and non_index @@ -2519,7 +2564,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, if i in axes: name = obj._AXIS_NAMES[i] index_axes_map[i] = _convert_index( - a).set_name(name).set_axis(i) + a, self.encoding).set_name(name).set_axis(i) else: # we might be able to change the axes on the appending data if @@ -2595,6 +2640,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, existing_col=existing_col, min_itemsize=min_itemsize, nan_rep=nan_rep, + encoding=self.encoding, info=self.info, **kwargs) col.set_pos(j) @@ -2716,7 +2762,7 @@ def read_column(self, column, where = None, **kwargs): # column must be an indexable or a data column c = getattr(self.table.cols, column) a.set_info(self.info) - return Series(a.convert(c[:], nan_rep=self.nan_rep).take_data()) + return Series(a.convert(c[:], nan_rep=self.nan_rep, encoding=self.encoding).take_data()) raise 
KeyError("column [%s] not found in the table" % column) @@ -2725,7 +2771,7 @@ class WORMTable(Table): table. writing is a one-time operation the data are stored in a format that allows for searching the data on disk """ - table_type = 'worm' + table_type = u'worm' def read(self, **kwargs): """ read the indicies and the indexing array, calculate offset rows and @@ -2750,7 +2796,7 @@ class LegacyTable(Table): IndexCol(name='column', axis=2, pos=1, index_kind='columns_kind'), DataCol(name='fields', cname='values', kind_attr='fields', pos=2)] - table_type = 'legacy' + table_type = u'legacy' ndim = 3 def write(self, **kwargs): @@ -2840,8 +2886,8 @@ def read(self, where=None, columns=None, **kwargs): class LegacyFrameTable(LegacyTable): """ support the legacy frame table """ - pandas_kind = 'frame_table' - table_type = 'legacy_frame' + pandas_kind = u'frame_table' + table_type = u'legacy_frame' obj_type = Panel def read(self, *args, **kwargs): @@ -2850,14 +2896,14 @@ def read(self, *args, **kwargs): class LegacyPanelTable(LegacyTable): """ support the legacy panel table """ - table_type = 'legacy_panel' + table_type = u'legacy_panel' obj_type = Panel class AppendableTable(LegacyTable): """ suppor the new appendable table formats """ _indexables = None - table_type = 'appendable' + table_type = u'appendable' def write(self, obj, axes=None, append=False, complib=None, complevel=None, fletcher32=None, min_itemsize=None, chunksize=None, @@ -2868,7 +2914,8 @@ def write(self, obj, axes=None, append=False, complib=None, # create the axes self.create_axes(axes=axes, obj=obj, validate=append, - min_itemsize=min_itemsize, **kwargs) + min_itemsize=min_itemsize, + **kwargs) if not self.is_exists: @@ -3019,8 +3066,8 @@ def delete(self, where=None, **kwargs): class AppendableFrameTable(AppendableTable): """ suppor the new appendable table formats """ - pandas_kind = 'frame_table' - table_type = 'appendable_frame' + pandas_kind = u'frame_table' + table_type = u'appendable_frame' ndim = 2 obj_type = DataFrame @@ -3074,8 +3121,8 @@ def read(self, where=None, columns=None, **kwargs): class GenericTable(AppendableFrameTable): """ a table that read/writes the generic pytables table format """ - pandas_kind = 'frame_table' - table_type = 'generic_table' + pandas_kind = u'frame_table' + table_type = u'generic_table' ndim = 2 obj_type = DataFrame @@ -3119,13 +3166,13 @@ def write(self, **kwargs): class AppendableMultiFrameTable(AppendableFrameTable): """ a frame with a multi-index """ - table_type = 'appendable_multiframe' + table_type = u'appendable_multiframe' obj_type = DataFrame ndim = 2 @property def table_type_short(self): - return 'appendable_multi' + return u'appendable_multi' def write(self, obj, data_columns=None, **kwargs): if data_columns is None: @@ -3150,7 +3197,7 @@ def read(self, columns=None, **kwargs): class AppendablePanelTable(AppendableTable): """ suppor the new appendable table formats """ - table_type = 'appendable_panel' + table_type = u'appendable_panel' ndim = 3 obj_type = Panel @@ -3167,11 +3214,11 @@ def is_transposed(self): class AppendableNDimTable(AppendablePanelTable): """ suppor the new appendable table formats """ - table_type = 'appendable_ndim' + table_type = u'appendable_ndim' ndim = 4 obj_type = Panel4D -def _convert_index(index): +def _convert_index(index, encoding=None): index_name = getattr(index,'name',None) if isinstance(index, DatetimeIndex): @@ -3211,7 +3258,7 @@ def _convert_index(index): # atom = _tables().ObjectAtom() # return np.asarray(values, dtype='O'), 'object', 
atom - converted = np.array(list(values), dtype=np.str_) + converted = _convert_string_array(values, encoding) itemsize = converted.dtype.itemsize return IndexCol(converted, 'string', _tables().StringCol(itemsize), itemsize=itemsize, index_name=index_name) @@ -3233,48 +3280,90 @@ def _convert_index(index): return IndexCol(np.asarray(values, dtype='O'), 'object', atom, index_name=index_name) -def _unconvert_index(data, kind): - if kind == 'datetime64': +def _unconvert_index(data, kind, encoding=None): + kind = _ensure_decoded(kind) + if kind == u'datetime64': index = DatetimeIndex(data) - elif kind == 'datetime': + elif kind == u'datetime': index = np.array([datetime.fromtimestamp(v) for v in data], dtype=object) - elif kind == 'date': + elif kind == u'date': index = np.array([date.fromtimestamp(v) for v in data], dtype=object) - elif kind in ('string', 'integer', 'float'): + elif kind in (u'integer', u'float'): index = np.array(data) - elif kind == 'object': + elif kind in (u'string',): + index = _unconvert_string_array(data, nan_rep=None, encoding=encoding) + elif kind == u'object': index = np.array(data[0]) else: # pragma: no cover raise ValueError('unrecognized index type %s' % kind) return index -def _unconvert_index_legacy(data, kind, legacy=False): - if kind == 'datetime': +def _unconvert_index_legacy(data, kind, legacy=False, encoding=None): + kind = _ensure_decoded(kind) + if kind == u'datetime': index = lib.time64_to_datetime(data) - elif kind in ('string', 'integer'): + elif kind in (u'integer',): index = np.array(data, dtype=object) + elif kind in (u'string',): + index = _unconvert_string_array(data, nan_rep=None, encoding=encoding) else: # pragma: no cover raise ValueError('unrecognized index type %s' % kind) return index -def _maybe_convert(values, val_kind): +def _convert_string_array(data, encoding, itemsize=None): + + # encode if needed + if encoding is not None and len(data): + f = np.vectorize(lambda x: x.encode(encoding), otypes=[np.object]) + data = f(data) + + # create the sized dtype + if itemsize is None: + itemsize = lib.max_len_string_array(com._ensure_object(data.ravel())) + + data = np.array(data,dtype="S%d" % itemsize) + return data + +def _unconvert_string_array(data, nan_rep=None, encoding=None): + """ deserialize a string array, possibly decoding """ + shape = data.shape + data = np.array(data.ravel(),dtype=object) + + # guard against a None encoding in PY3 (because of a legacy + # where the passed encoding is actually None) + encoding = _ensure_encoding(encoding) + if encoding is not None and len(data): + f = np.vectorize(lambda x: x.decode(encoding),otypes=[np.object]) + data = f(data) + + if nan_rep is None: + nan_rep = 'nan' + + data = lib.string_array_replace_from_nan_rep(data, nan_rep) + return data.reshape(shape) + +def _maybe_convert(values, val_kind, encoding): if _need_convert(val_kind): - conv = _get_converter(val_kind) + conv = _get_converter(val_kind, encoding) # conv = np.frompyfunc(conv, 1, 1) values = conv(values) return values -def _get_converter(kind): +def _get_converter(kind, encoding): + kind = _ensure_decoded(kind) if kind == 'datetime64': return lambda x: np.array(x, dtype='M8[ns]') - if kind == 'datetime': + elif kind == 'datetime': return lib.convert_timestamps + elif kind == 'string': + return lambda x: _unconvert_string_array(x,encoding=encoding) else: # pragma: no cover raise ValueError('invalid kind %s' % kind) def _need_convert(kind): - if kind in ('datetime', 'datetime64'): + kind = _ensure_decoded(kind) + if kind in (u'datetime', 
u'datetime64', u'string'): return True return False @@ -3288,7 +3377,8 @@ class Term(object): >, >=, <, <=, =, != (not equal) are allowed value : a value or list of values (required) queryables : a kinds map (dict of column name -> kind), or None i column is non-indexable - + encoding : an encoding that will encode the query terms + Returns ------- a Term object @@ -3301,20 +3391,21 @@ class Term(object): >>> Term('index', ['20121114','20121114']) >>> Term('index', datetime(2012,11,14)) >>> Term('major_axis>20121114') - >>> Term('minor_axis', ['A','B']) + >>> Term('minor_axis', ['A','U']) """ _ops = ['<=', '<', '>=', '>', '!=', '==', '='] _search = re.compile("^\s*(?P\w+)\s*(?P%s)\s*(?P.+)\s*$" % '|'.join(_ops)) _max_selectors = 31 - def __init__(self, field, op=None, value=None, queryables=None): + def __init__(self, field, op=None, value=None, queryables=None, encoding=None): self.field = None self.op = None self.value = None self.q = queryables or dict() self.filter = None self.condition = None + self.encoding = encoding # unpack lists/tuples in field while(isinstance(field, (tuple, list))): @@ -3366,16 +3457,16 @@ def __init__(self, field, op=None, value=None, queryables=None): if self.field is None or self.op is None or self.value is None: raise ValueError("Could not create this term [%s]" % str(self)) - # = vs == + # = vs == if self.op == '=': self.op = '==' # we have valid conditions if self.op in ['>', '>=', '<', '<=']: - if hasattr(self.value, '__iter__') and len(self.value) > 1: + if hasattr(self.value, '__iter__') and len(self.value) > 1 and not isinstance(self.value,basestring): raise ValueError("an inequality condition cannot have multiple values [%s]" % str(self)) - if not hasattr(self.value, '__iter__'): + if not is_list_like(self.value): self.value = [self.value] if len(self.q): @@ -3401,6 +3492,11 @@ def kind(self): """ the kind of my field """ return self.q.get(self.field) + def generate(self, v): + """ create and return the op string for this TermValue """ + val = v.tostring(self.encoding) + return "(%s %s %s)" % (self.field, self.op, val) + def eval(self): """ set the numexpr expression for this term """ @@ -3411,40 +3507,39 @@ def eval(self): if self.is_in_table: values = [self.convert_value(v) for v in self.value] else: - values = [[v, v] for v in self.value] + values = [TermValue(v,v,self.kind) for v in self.value] # equality conditions if self.op in ['==', '!=']: # our filter op expression if self.op == '!=': - filter_op = lambda axis, values: not axis.isin(values) + filter_op = lambda axis, vals: not axis.isin(vals) else: - filter_op = lambda axis, values: axis.isin(values) + filter_op = lambda axis, vals: axis.isin(vals) if self.is_in_table: # too many values to create the expression? 
if len(values) <= self._max_selectors: - self.condition = "(%s)" % ' | '.join( - ["(%s %s %s)" % (self.field, self.op, v[0]) for v in values]) + vs = [ self.generate(v) for v in values ] + self.condition = "(%s)" % ' | '.join(vs) # use a filter after reading else: - self.filter = (self.field, filter_op, Index([v[1] for v in values])) + self.filter = (self.field, filter_op, Index([v.value for v in values])) else: - self.filter = (self.field, filter_op, Index([v[1] for v in values])) + self.filter = (self.field, filter_op, Index([v.value for v in values])) else: if self.is_in_table: - self.condition = '(%s %s %s)' % ( - self.field, self.op, values[0][0]) - + self.condition = self.generate(values[0]) + else: raise TypeError("passing a filterable condition to a non-table indexer [%s]" % str(self)) @@ -3452,33 +3547,56 @@ def eval(self): def convert_value(self, v): """ convert the expression that is in the term to something that is accepted by pytables """ - if self.kind == 'datetime64' or self.kind == 'datetime' : + def stringify(value): + value = str(value) + if self.encoding is not None: + value = value.encode(self.encoding) + return value + + kind = _ensure_decoded(self.kind) + if kind == u'datetime64' or kind == u'datetime' : v = lib.Timestamp(v) if v.tz is not None: v = v.tz_convert('UTC') - return [v.value, v] - elif isinstance(v, datetime) or hasattr(v, 'timetuple') or self.kind == 'date': + return TermValue(v,v.value,kind) + elif isinstance(v, datetime) or hasattr(v, 'timetuple') or kind == u'date': v = time.mktime(v.timetuple()) - return [v, Timestamp(v) ] - elif self.kind == 'integer': + return TermValue(v,Timestamp(v),kind) + elif kind == u'integer': v = int(float(v)) - return [v, v] - elif self.kind == 'float': + return TermValue(v,v,kind) + elif kind == u'float': v = float(v) - return [v, v] - elif self.kind == 'bool': + return TermValue(v,v,kind) + elif kind == u'bool': if isinstance(v, basestring): - v = not str(v).strip().lower() in ["false", "f", "no", "n", "none", "0", "[]", "{}", ""] + v = not v.strip().lower() in [u'false', u'f', u'no', u'n', u'none', u'0', u'[]', u'{}', u''] else: v = bool(v) - return [v, v] + return TermValue(v,v,kind) elif not isinstance(v, basestring): - v = str(v) - return [v, v] + v = stringify(v) + return TermValue(v,stringify(v),u'string') # string quoting - return ["'" + v + "'", v] + return TermValue(v,stringify(v),u'string') + +class TermValue(object): + """ hold a term value that we use to construct a condition/filter """ + + def __init__(self, value, converted, kind): + self.value = value + self.converted = converted + self.kind = kind + def tostring(self, encoding): + """ quote the string if not encoded + else encode and return """ + if self.kind == u'string': + if encoding is not None: + return self.converted + return '"%s"' % self.converted + return self.converted class Coordinates(object): """ holds a returned coordinates list, useful to select the same rows from different tables @@ -3528,9 +3646,9 @@ def __init__(self, table, where=None, start=None, stop=None, **kwargs): # create the numexpr & the filter if self.terms: - conds = [t.condition for t in self.terms if t.condition is not None] - if len(conds): - self.condition = "(%s)" % ' & '.join(conds) + terms = [ t for t in self.terms if t.condition is not None ] + if len(terms): + self.condition = "(%s)" % ' & '.join([ t.condition for t in terms ]) self.filter = [] for t in self.terms: if t.filter is not None: @@ -3553,7 +3671,7 @@ def generate(self, where): where = [where] queryables = 
self.table.queryables() - return [Term(c, queryables=queryables) for c in where] + return [Term(c, queryables=queryables, encoding=self.table.encoding) for c in where] def select(self): """ diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index d0f03774f2070..8b3d4a475d952 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -17,6 +17,7 @@ from pandas.tests.test_series import assert_series_equal from pandas.tests.test_frame import assert_frame_equal from pandas import concat, Timestamp +from pandas.util import py3compat from numpy.testing.decorators import slow @@ -115,7 +116,7 @@ def roundtrip(key, obj,**kwargs): o = tm.makeTimeSeries() assert_series_equal(o, roundtrip('series',o)) - + o = tm.makeStringSeries() assert_series_equal(o, roundtrip('string_series',o)) @@ -474,6 +475,20 @@ def test_append(self): store.append('uints', uint_data, data_columns=['u08','u16','u32']) # 64-bit indices not yet supported tm.assert_frame_equal(store['uints'], uint_data) + def test_encoding(self): + + with ensure_clean(self.path) as store: + df = DataFrame(dict(A='foo',B='bar'),index=range(5)) + df.loc[2,'A'] = np.nan + df.loc[3,'B'] = np.nan + _maybe_remove(store, 'df') + store.append('df', df, encoding='ascii') + tm.assert_frame_equal(store['df'], df) + + expected = df.reindex(columns=['A']) + result = store.select('df',Term('columns=A',encoding='ascii')) + tm.assert_frame_equal(result,expected) + def test_append_some_nans(self): with ensure_clean(self.path) as store: @@ -556,6 +571,7 @@ def test_append_some_nans(self): def test_append_frame_column_oriented(self): with ensure_clean(self.path) as store: + # column oriented df = tm.makeTimeDataFrame() _maybe_remove(store, 'df1') @@ -1261,8 +1277,14 @@ def test_unimplemented_dtypes_table_columns(self): with ensure_clean(self.path) as store: + l = [('date', datetime.date(2001, 1, 2))] + + # py3 ok for unicode + if not py3compat.PY3: + l.append(('unicode', u'\u03c3')) + ### currently not supported dtypes #### - for n, f in [('unicode', u'\u03c3'), ('date', datetime.date(2001, 1, 2))]: + for n, f in l: df = tm.makeDataFrame() df[n] = f self.assertRaises( @@ -2545,6 +2567,7 @@ def test_legacy_0_10_read(self): # legacy from 0.10 try: store = HDFStore(tm.get_data_path('legacy_hdf/legacy_0.10.h5'), 'r') + str(store) for k in store.keys(): store.select(k) finally: @@ -2554,6 +2577,7 @@ def test_legacy_0_11_read(self): # legacy from 0.11 try: store = HDFStore(tm.get_data_path('legacy_hdf/legacy_table_0.11.h5'), 'r') + str(store) df = store.select('df') df1 = store.select('df1') mi = store.select('mi') @@ -2585,24 +2609,25 @@ def do_copy(f = None, new_f = None, keys = None, propindexes = True, **kwargs): # check indicies & nrows for k in tstore.keys(): - if tstore.is_table(k): + if tstore.get_storer(k).is_table: new_t = tstore.get_storer(k) orig_t = store.get_storer(k) self.assert_(orig_t.nrows == new_t.nrows) - for a in orig_t.axes: - if a.is_indexed: - self.assert_(new_t[a.name].is_indexed == True) - except (Exception), detail: - pass + # check propindixes + if propindexes: + for a in orig_t.axes: + if a.is_indexed: + self.assert_(new_t[a.name].is_indexed == True) + finally: safe_close(store) safe_close(tstore) safe_remove(new_f) do_copy() - do_copy(keys = ['df']) + do_copy(keys = ['/a','/b','/df1_mixed']) do_copy(propindexes = False) # new table diff --git a/pandas/lib.pyx b/pandas/lib.pyx index 15791a984ecc5..a80ad5b7d0208 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -14,6 +14,7 @@ from 
cpython cimport (PyDict_New, PyDict_GetItem, PyDict_SetItem, Py_INCREF, PyTuple_SET_ITEM, PyList_Check, PyFloat_Check, PyString_Check, + PyBytes_Check, PyTuple_SetItem, PyTuple_New, PyObject_SetAttrString) @@ -762,7 +763,7 @@ def max_len_string_array(ndarray[object, ndim=1] arr): m = 0 for i from 0 <= i < length: v = arr[i] - if PyString_Check(v): + if PyString_Check(v) or PyBytes_Check(v): l = len(v) if l > m: @@ -772,11 +773,10 @@ def max_len_string_array(ndarray[object, ndim=1] arr): @cython.boundscheck(False) @cython.wraparound(False) -def array_replace_from_nan_rep(ndarray[object, ndim=1] arr, object nan_rep, object replace = None): +def string_array_replace_from_nan_rep(ndarray[object, ndim=1] arr, object nan_rep, object replace = None): """ replace the values in the array with replacement if they are nan_rep; return the same array """ - cdef int length = arr.shape[0] - cdef int i = 0 + cdef int length = arr.shape[0], i = 0 if replace is None: replace = np.nan @@ -788,7 +788,6 @@ def array_replace_from_nan_rep(ndarray[object, ndim=1] arr, object nan_rep, obje @cython.boundscheck(False) @cython.wraparound(False) - def write_csv_rows(list data, list data_index, int nlevels, list cols, object writer): cdef int N, j, i, ncols
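
Usage sketch (not part of the patch above): the new ``encoding`` keyword threads from ``HDFStore.append``/``put`` through ``select`` and ``Term``. A minimal round-trip, modeled on the ``test_encoding`` test added in this patch — the file name ``store.h5`` is arbitrary, and ``Term`` is assumed to be importable from ``pandas.io.pytables``::

    import numpy as np
    from pandas import DataFrame, HDFStore
    from pandas.io.pytables import Term

    # a frame with string columns, including some nans
    df = DataFrame(dict(A='foo', B='bar'), index=range(5))
    df.loc[2, 'A'] = np.nan

    store = HDFStore('store.h5', mode='w')
    try:
        # strings are encoded with the given codec on write; the
        # encoding is stored as an attribute and reused on read
        store.append('df', df, encoding='ascii')

        # round-trip: stored bytes are decoded back to strings
        roundtripped = store['df']

        # query terms against an encoded store take the same encoding
        result = store.select('df', Term('columns=A', encoding='ascii'))
    finally:
        store.close()

On Python 3, when no ``encoding`` is given, ``_ensure_encoding`` falls back to the module default (``UTF-8``), so existing ``append``/``put`` call sites keep working unchanged.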