-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
Allow `errors` keyword for HDF IO Encoding Err Handling
#20873
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 7 commits
f75ca6e
97f6a54
9ae2ea0
cfe09d1
3973ef7
61a0c6b
0fe838a
9a13234
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -308,6 +308,10 @@ def read_hdf(path_or_buf, key=None, mode='r', **kwargs): | |
return columns | ||
iterator : optional, boolean, return an iterator, default False | ||
chunksize : optional, nrows to include in iteration, return an iterator | ||
errors : str, default 'strict' | ||
Specifies how encoding and decoding errors are to be handled. | ||
See the errors argument for :func:`open` for a full list | ||
of options. | ||
|
||
Returns | ||
------- | ||
|
@@ -705,7 +709,7 @@ def select(self, key, where=None, start=None, stop=None, columns=None, | |
def func(_start, _stop, _where): | ||
return s.read(start=_start, stop=_stop, | ||
where=_where, | ||
columns=columns, **kwargs) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I removed this |
||
columns=columns) | ||
|
||
# create the iterator | ||
it = TableIterator(self, s, func, where=where, nrows=s.nrows, | ||
|
@@ -1566,14 +1570,14 @@ def infer(self, handler): | |
new_self.read_metadata(handler) | ||
return new_self | ||
|
||
def convert(self, values, nan_rep, encoding): | ||
def convert(self, values, nan_rep, encoding, errors): | ||
""" set the values from this selection: take = take ownership """ | ||
|
||
# values is a recarray | ||
if values.dtype.fields is not None: | ||
values = values[self.cname] | ||
|
||
values = _maybe_convert(values, self.kind, encoding) | ||
values = _maybe_convert(values, self.kind, encoding, errors) | ||
|
||
kwargs = dict() | ||
if self.freq is not None: | ||
|
@@ -1748,7 +1752,7 @@ class GenericIndexCol(IndexCol): | |
def is_indexed(self): | ||
return False | ||
|
||
def convert(self, values, nan_rep, encoding): | ||
def convert(self, values, nan_rep, encoding, errors): | ||
""" set the values from this selection: take = take ownership """ | ||
|
||
self.values = Int64Index(np.arange(self.table.nrows)) | ||
|
@@ -1877,7 +1881,7 @@ def set_kind(self): | |
self.typ = getattr(self.description, self.cname, None) | ||
|
||
def set_atom(self, block, block_items, existing_col, min_itemsize, | ||
nan_rep, info, encoding=None, **kwargs): | ||
nan_rep, info, encoding=None, errors='strict'): | ||
""" create and setup my atom from the block b """ | ||
|
||
self.values = list(block_items) | ||
|
@@ -1922,7 +1926,8 @@ def set_atom(self, block, block_items, existing_col, min_itemsize, | |
existing_col, | ||
min_itemsize, | ||
nan_rep, | ||
encoding) | ||
encoding, | ||
errors) | ||
|
||
# set as a data block | ||
else: | ||
|
@@ -1932,7 +1937,7 @@ def get_atom_string(self, block, itemsize): | |
return _tables().StringCol(itemsize=itemsize, shape=block.shape[0]) | ||
|
||
def set_atom_string(self, block, block_items, existing_col, min_itemsize, | ||
nan_rep, encoding): | ||
nan_rep, encoding, errors): | ||
# fill nan items with myself, don't disturb the blocks by | ||
# trying to downcast | ||
block = block.fillna(nan_rep, downcast=False) | ||
|
@@ -1958,7 +1963,7 @@ def set_atom_string(self, block, block_items, existing_col, min_itemsize, | |
) | ||
|
||
# itemsize is the maximum length of a string (along any dimension) | ||
data_converted = _convert_string_array(data, encoding) | ||
data_converted = _convert_string_array(data, encoding, errors) | ||
itemsize = data_converted.itemsize | ||
|
||
# specified min_itemsize? | ||
|
@@ -2089,7 +2094,7 @@ def validate_attr(self, append): | |
raise ValueError("appended items dtype do not match existing " | ||
"items dtype in table!") | ||
|
||
def convert(self, values, nan_rep, encoding): | ||
def convert(self, values, nan_rep, encoding, errors): | ||
"""set the data from this selection (and convert to the correct dtype | ||
if we can) | ||
""" | ||
|
@@ -2163,7 +2168,7 @@ def convert(self, values, nan_rep, encoding): | |
# convert nans / decode | ||
if _ensure_decoded(self.kind) == u('string'): | ||
self.data = _unconvert_string_array( | ||
self.data, nan_rep=nan_rep, encoding=encoding) | ||
self.data, nan_rep=nan_rep, encoding=encoding, errors=errors) | ||
|
||
return self | ||
|
||
|
@@ -2229,10 +2234,12 @@ class Fixed(StringMixin): | |
ndim = None | ||
is_table = False | ||
|
||
def __init__(self, parent, group, encoding=None, **kwargs): | ||
def __init__(self, parent, group, encoding=None, errors='strict', | ||
**kwargs): | ||
self.parent = parent | ||
self.group = group | ||
self.encoding = _ensure_encoding(encoding) | ||
self.errors = errors | ||
self.set_version() | ||
|
||
@property | ||
|
@@ -2436,10 +2443,12 @@ def is_exists(self): | |
def set_attrs(self): | ||
""" set our object attributes """ | ||
self.attrs.encoding = self.encoding | ||
self.attrs.errors = self.errors | ||
|
||
def get_attrs(self): | ||
""" retrieve our attributes """ | ||
self.encoding = _ensure_encoding(getattr(self.attrs, 'encoding', None)) | ||
self.errors = getattr(self.attrs, 'errors', 'strict') | ||
for n in self.attributes: | ||
setattr(self, n, _ensure_decoded(getattr(self.attrs, n, None))) | ||
|
||
|
@@ -2506,7 +2515,7 @@ def write_index(self, key, index): | |
self.write_sparse_intindex(key, index) | ||
else: | ||
setattr(self.attrs, '%s_variety' % key, 'regular') | ||
converted = _convert_index(index, self.encoding, | ||
converted = _convert_index(index, self.encoding, self.errors, | ||
self.format_type).set_name('index') | ||
|
||
self.write_array(key, converted.values) | ||
|
@@ -2552,7 +2561,7 @@ def write_multi_index(self, key, index): | |
index.names)): | ||
# write the level | ||
level_key = '%s_level%d' % (key, i) | ||
conv_level = _convert_index(lev, self.encoding, | ||
conv_level = _convert_index(lev, self.encoding, self.errors, | ||
self.format_type).set_name(level_key) | ||
self.write_array(level_key, conv_level.values) | ||
node = getattr(self.group, level_key) | ||
|
@@ -2613,11 +2622,13 @@ def read_index_node(self, node, start=None, stop=None): | |
|
||
if kind in (u('date'), u('datetime')): | ||
index = factory(_unconvert_index(data, kind, | ||
encoding=self.encoding), | ||
encoding=self.encoding, | ||
errors=self.errors), | ||
dtype=object, **kwargs) | ||
else: | ||
index = factory(_unconvert_index(data, kind, | ||
encoding=self.encoding), **kwargs) | ||
encoding=self.encoding, | ||
errors=self.errors), **kwargs) | ||
|
||
index.name = name | ||
|
||
|
@@ -2730,7 +2741,8 @@ def read_index_legacy(self, key, start=None, stop=None): | |
node = getattr(self.group, key) | ||
data = node[start:stop] | ||
kind = node._v_attrs.kind | ||
return _unconvert_index_legacy(data, kind, encoding=self.encoding) | ||
return _unconvert_index_legacy(data, kind, encoding=self.encoding, | ||
errors=self.errors) | ||
|
||
|
||
class LegacySeriesFixed(LegacyFixed): | ||
|
@@ -3149,7 +3161,8 @@ def write_metadata(self, key, values): | |
""" | ||
values = Series(values) | ||
self.parent.put(self._get_metadata_path(key), values, format='table', | ||
encoding=self.encoding, nan_rep=self.nan_rep) | ||
encoding=self.encoding, errors=self.errors, | ||
nan_rep=self.nan_rep) | ||
|
||
def read_metadata(self, key): | ||
""" return the meta data array for this key """ | ||
|
@@ -3170,6 +3183,7 @@ def set_attrs(self): | |
self.attrs.data_columns = self.data_columns | ||
self.attrs.nan_rep = self.nan_rep | ||
self.attrs.encoding = self.encoding | ||
self.attrs.errors = self.errors | ||
self.attrs.levels = self.levels | ||
self.attrs.metadata = self.metadata | ||
self.set_info() | ||
|
@@ -3185,6 +3199,7 @@ def get_attrs(self): | |
self.nan_rep = getattr(self.attrs, 'nan_rep', None) | ||
self.encoding = _ensure_encoding( | ||
getattr(self.attrs, 'encoding', None)) | ||
self.errors = getattr(self.attrs, 'errors', 'strict') | ||
self.levels = getattr( | ||
self.attrs, 'levels', None) or [] | ||
self.index_axes = [ | ||
|
@@ -3342,7 +3357,8 @@ def read_axes(self, where, **kwargs): | |
# convert the data | ||
for a in self.axes: | ||
a.set_info(self.info) | ||
a.convert(values, nan_rep=self.nan_rep, encoding=self.encoding) | ||
a.convert(values, nan_rep=self.nan_rep, encoding=self.encoding, | ||
errors=self.errors) | ||
|
||
return True | ||
|
||
|
@@ -3424,6 +3440,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, | |
data_columns = existing_table.data_columns | ||
nan_rep = existing_table.nan_rep | ||
self.encoding = existing_table.encoding | ||
self.errors = existing_table.errors | ||
self.info = copy.copy(existing_table.info) | ||
else: | ||
existing_table = None | ||
|
@@ -3450,7 +3467,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, | |
if i in axes: | ||
name = obj._AXIS_NAMES[i] | ||
index_axes_map[i] = _convert_index( | ||
a, self.encoding, self.format_type | ||
a, self.encoding, self.errors, self.format_type | ||
).set_name(name).set_axis(i) | ||
else: | ||
|
||
|
@@ -3569,8 +3586,8 @@ def get_blk_items(mgr, blocks): | |
min_itemsize=min_itemsize, | ||
nan_rep=nan_rep, | ||
encoding=self.encoding, | ||
info=self.info, | ||
**kwargs) | ||
errors=self.errors, | ||
info=self.info) | ||
col.set_pos(j) | ||
|
||
self.values_axes.append(col) | ||
|
@@ -3734,7 +3751,8 @@ def read_column(self, column, where=None, start=None, stop=None, **kwargs): | |
a.set_info(self.info) | ||
return Series(_set_tz(a.convert(c[start:stop], | ||
nan_rep=self.nan_rep, | ||
encoding=self.encoding | ||
encoding=self.encoding, | ||
errors=self.errors | ||
).take_data(), | ||
a.tz, True), name=column) | ||
|
||
|
@@ -4415,7 +4433,7 @@ def _set_tz(values, tz, preserve_UTC=False, coerce=False): | |
return values | ||
|
||
|
||
def _convert_index(index, encoding=None, format_type=None): | ||
def _convert_index(index, encoding=None, errors='strict', format_type=None): | ||
index_name = getattr(index, 'name', None) | ||
|
||
if isinstance(index, DatetimeIndex): | ||
|
@@ -4469,7 +4487,7 @@ def _convert_index(index, encoding=None, format_type=None): | |
# atom = _tables().ObjectAtom() | ||
# return np.asarray(values, dtype='O'), 'object', atom | ||
|
||
converted = _convert_string_array(values, encoding) | ||
converted = _convert_string_array(values, encoding, errors) | ||
itemsize = converted.dtype.itemsize | ||
return IndexCol( | ||
converted, 'string', _tables().StringCol(itemsize), | ||
|
@@ -4500,7 +4518,7 @@ def _convert_index(index, encoding=None, format_type=None): | |
index_name=index_name) | ||
|
||
|
||
def _unconvert_index(data, kind, encoding=None): | ||
def _unconvert_index(data, kind, encoding=None, errors='strict'): | ||
kind = _ensure_decoded(kind) | ||
if kind == u('datetime64'): | ||
index = DatetimeIndex(data) | ||
|
@@ -4519,28 +4537,31 @@ def _unconvert_index(data, kind, encoding=None): | |
elif kind in (u('integer'), u('float')): | ||
index = np.asarray(data) | ||
elif kind in (u('string')): | ||
index = _unconvert_string_array(data, nan_rep=None, encoding=encoding) | ||
index = _unconvert_string_array(data, nan_rep=None, encoding=encoding, | ||
errors=errors) | ||
elif kind == u('object'): | ||
index = np.asarray(data[0]) | ||
else: # pragma: no cover | ||
raise ValueError('unrecognized index type %s' % kind) | ||
return index | ||
|
||
|
||
def _unconvert_index_legacy(data, kind, legacy=False, encoding=None): | ||
def _unconvert_index_legacy(data, kind, legacy=False, encoding=None, | ||
errors='strict'): | ||
kind = _ensure_decoded(kind) | ||
if kind == u('datetime'): | ||
index = to_datetime(data) | ||
elif kind in (u('integer')): | ||
index = np.asarray(data, dtype=object) | ||
elif kind in (u('string')): | ||
index = _unconvert_string_array(data, nan_rep=None, encoding=encoding) | ||
index = _unconvert_string_array(data, nan_rep=None, encoding=encoding, | ||
errors=errors) | ||
else: # pragma: no cover | ||
raise ValueError('unrecognized index type %s' % kind) | ||
return index | ||
|
||
|
||
def _convert_string_array(data, encoding, itemsize=None): | ||
def _convert_string_array(data, encoding, errors, itemsize=None): | ||
""" | ||
we take a string-like that is object dtype and coerce to a fixed size | ||
string type | ||
|
@@ -4549,6 +4570,7 @@ def _convert_string_array(data, encoding, itemsize=None): | |
---------- | ||
data : a numpy array of object dtype | ||
encoding : None or string-encoding | ||
errors : handler for encoding errors | ||
itemsize : integer, optional, defaults to the max length of the strings | ||
|
||
Returns | ||
|
@@ -4559,7 +4581,7 @@ def _convert_string_array(data, encoding, itemsize=None): | |
# encode if needed | ||
if encoding is not None and len(data): | ||
data = Series(data.ravel()).str.encode( | ||
encoding).values.reshape(data.shape) | ||
encoding, errors).values.reshape(data.shape) | ||
|
||
# create the sized dtype | ||
if itemsize is None: | ||
|
@@ -4570,7 +4592,8 @@ def _convert_string_array(data, encoding, itemsize=None): | |
return data | ||
|
||
|
||
def _unconvert_string_array(data, nan_rep=None, encoding=None): | ||
def _unconvert_string_array(data, nan_rep=None, encoding=None, | ||
errors='strict'): | ||
""" | ||
inverse of _convert_string_array | ||
|
||
|
@@ -4579,6 +4602,7 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None): | |
data : fixed length string dtyped array | ||
nan_rep : the storage repr of NaN, optional | ||
encoding : the encoding of the data, optional | ||
errors : handler for encoding errors, default 'strict' | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you show the options and/or point to the Python reference for these? |
||
|
||
Returns | ||
------- | ||
|
@@ -4600,7 +4624,7 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None): | |
dtype = "S{0}".format(itemsize) | ||
|
||
if isinstance(data[0], compat.binary_type): | ||
data = Series(data).str.decode(encoding).values | ||
data = Series(data).str.decode(encoding, errors=errors).values | ||
else: | ||
data = data.astype(dtype, copy=False).astype(object, copy=False) | ||
|
||
|
@@ -4611,22 +4635,23 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None): | |
return data.reshape(shape) | ||
|
||
|
||
def _maybe_convert(values, val_kind, encoding): | ||
def _maybe_convert(values, val_kind, encoding, errors): | ||
if _need_convert(val_kind): | ||
conv = _get_converter(val_kind, encoding) | ||
conv = _get_converter(val_kind, encoding, errors) | ||
# conv = np.frompyfunc(conv, 1, 1) | ||
values = conv(values) | ||
return values | ||
|
||
|
||
def _get_converter(kind, encoding): | ||
def _get_converter(kind, encoding, errors): | ||
kind = _ensure_decoded(kind) | ||
if kind == 'datetime64': | ||
return lambda x: np.asarray(x, dtype='M8[ns]') | ||
elif kind == 'datetime': | ||
return lambda x: to_datetime(x, cache=True).to_pydatetime() | ||
elif kind == 'string': | ||
return lambda x: _unconvert_string_array(x, encoding=encoding) | ||
return lambda x: _unconvert_string_array(x, encoding=encoding, | ||
errors=errors) | ||
else: # pragma: no cover | ||
raise ValueError('invalid kind %s' % kind) | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@TomAugspurger I know you are adding a few things for the RC so don't need to change anything here, but do we typically document things in the API like this? Wondering if we shouldn't make all of the documented features actual keyword arguments in the call signature rather than tucking them away in kwargs.
FWIW if we have `errors` here we'd probably want to add `encoding` as well.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The signature should be changed from kwargs to reflect the actual signature. I thought we had an issue for it, but didn't find one. Opened #20903
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks! I'll take a stab at that one later